From c3d5164c181d0ba0368df7514d1824dedc5d8fcf Mon Sep 17 00:00:00 2001
From: Sting <loic.allegre@ensiie.fr>
Date: Fri, 21 Jul 2023 11:08:24 +0200
Subject: [PATCH] Refactor: split debug plotting out of autosyl/segment.py

Move the onset/feature visualization code from the __main__ block of
autosyl/segment.py into a standalone plot_syls.py script, and change
segment() to take a reference ASS file path (reference_file) instead of
pre-parsed syllables, parsing it with getSyls() internally.

---
 autokara.py        |   6 +-
 autosyl/segment.py | 297 ++-------------------------------------------
 plot_syls.py       | 184 ++++++++++++++++++++++++++++
 3 files changed, 196 insertions(+), 291 deletions(-)
 create mode 100644 plot_syls.py

diff --git a/autokara.py b/autokara.py
index f4d3ba6..1da2d81 100644
--- a/autokara.py
+++ b/autokara.py
@@ -41,12 +41,12 @@ else:
     vocals_file = args.source_file
 
 if args.ref:
-    reference_syls = getSyls(args.ref)
+    reference_file = args.ref
 else:
-    reference_syls = None
+    reference_file = None
 
 print("Identifying syl starts...")
-syls = segment(vocals_file, reference_syls=reference_syls)
+syls = segment(vocals_file, reference_file=reference_file)
 print(syls)
 
 print("Syls found, writing ASS file...")
diff --git a/autosyl/segment.py b/autosyl/segment.py
index 649b1f1..b237350 100644
--- a/autosyl/segment.py
+++ b/autosyl/segment.py
@@ -11,11 +11,17 @@ from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
 
 
 
-def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
+def segment(songfile, reference_file=None, syls_per_line=10, last_syl_dur=500):
 
     delay = -4
     backtrack = False
 
+    if reference_file:
+        reference_syls = getSyls(reference_file)
+    else:
+        reference_syls = []
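+    # reference_syls (from getSyls): one list per karaoke line, each syllable
+    # as (start_in_centiseconds, text, duration), plus a final [end, '', 0] marker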
+
+    print(reference_syls)
 
     audio_file = songfile                      # pre-computed source-separated vocals; These models do not work with mixture input.
     word_file = None                           # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it
@@ -46,87 +52,6 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
     words_onsets = np.array([word_align[i][0] for i in range(len(word_align))])
     print(words_onsets)
 
-    '''
-    cnn = madmom.features.onsets.CNNOnsetProcessor()
-    spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
-
-    sig = madmom.audio.signal.Signal(songfile, num_channels=1)
-    parsel = parselmouth.Sound(sig)
-
-    spec = madmom.audio.spectrogram.Spectrogram(sig)
-    filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24)
-    log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1)
-
-    magnitude = np.max(log_spec[:,:100], axis=1)
-
-    cnn_function = cnn(sig)
-    spectral_function = spectral(sig)
-    spectral_function = spectral_function/(spectral_function.max())
-    
-    #activation_function = 0.5*cnn_function + 0.5*spectral_function
-    activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function)
-    #activation_function = np.where(spectral_function > 0.14, cnn_function, 0)
-    #onsets = proc(activation_function)
-    
-    if reference_syls:
-        activation_threshold = 0.1
-    else:
-        activation_threshold = 0.2
-
-
-    activation_smoothed = madmom.audio.signal.smooth(activation_function, 20)
-    cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20)
-    onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=activation_threshold, smooth=0)
-    #onsets = np.array([o for o in onsets if cnn_smoothed[o] > activation_threshold])
-
-    pitch = parsel.to_pitch()
-    pitch_values = pitch.selected_array['frequency']
-
-    pad_before = round(pitch.xs()[0]*100)
-    pad_after = len(magnitude) - len(pitch_values) - pad_before
-
-    pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0))
-
-    mask_function = magnitude * pitch_values
-    mask_function = mask_function/np.max(mask_function)
-    mask_threshold = 0.15
-    mask_window = [1,6]
-    invalid_onsets_idx = []
-    
-    for i in range(len(onsets)):
-        if np.max(mask_function[onsets[i]+mask_window[0]:onsets[i]+mask_window[1]]) < mask_threshold:
-            invalid_onsets_idx.append(i)
-    
-    onsets = np.delete(onsets, invalid_onsets_idx)
-
-
-    if reference_syls:
-        filtered_onsets = []
-        line_index = 0
-        for line in reference_syls:
-            line_index += 1
-            syl_number = len(line) - 1
-            line_onsets = [o for o in onsets if (o >= line[0][0] and o <= line[-1][0])]
-            line_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
-            missing_syls = 0
-            if syl_number > len(line_onsets):
-                print("WARNING : failed to detect enough onsets in line %d (%d, %d)" % (line_index, line[0][0], line[-1][0]))
-                missing_syls = syl_number - len(line_onsets)
-            filtered_onsets += line_onsets[0:syl_number]
-            filtered_onsets += [line[-1][0] for i in range(missing_syls)] # If missing some syllables, pad with 0-length syls
-        
-        onsets = np.array(sorted(filtered_onsets))
-
-
-    if backtrack:
-        # Backtrack onsets to closest earlier local minimum
-        backtrack_max_frames = 50
-        for i in range(len(onsets)):
-            initial_onset = onsets[i]
-            while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames):
-                onsets[i] -= 1
-    '''
-
     onsets = words_onsets
     onsets = (onsets + delay)/100
     #print(onsets)
@@ -136,8 +61,8 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
         onset_index = 0
         for line in reference_syls:
             #print(onset_index, " : ", line)
-            l = [[onsets[onset_index + i], words[onset_index + i]] for i in range(len(line)-1)]
-            l.append([line[-1][0]/100, ''])
+            l = [[onsets[onset_index + i], line[i][1]] for i in range(len(line)-1)]
+            l.append([word_align[onset_index + (len(line) - 2)][1]/100, ''])
             syls.append(l)
             onset_index += (len(line) - 1)
     else:
@@ -158,207 +83,3 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
     return syls
 
 
-
-if __name__ == "__main__":
-
-    def dateToTime(date):
-        """
-        The `date` should be in the following format: H:MM:SS.cs
-        """
-        hourInMinuts = int(date[0:1]) * 60
-        minutsInSeconds = (int(date[2:4]) + hourInMinuts) * 60
-        secondsInCentiseconds = (int(date[5:7]) + minutsInSeconds) * 100
-        return int(date[8:10]) + secondsInCentiseconds
-
-
-    def getSyls(ass_file):
-        SYLS = []
-        with open(ass_file, 'r') as f:
-            CONTENT = f.read()
-            LINES_KARA = re.compile(r"Comment:.*(\d+:\d{2}:\d{2}.\d{2}),(\d+:\d{2}:\d{2}.\d{2}),.*,karaoke,(.*)\n");
-            RGX_TAGS = re.compile(r"\{\\k(\d+)\}([^\{\n\r]*)")
-            for line in LINES_KARA.findall(CONTENT):
-                syl_line = []
-                lastTime = dateToTime(line[0])
-                for couple in RGX_TAGS.findall(line[2]):
-                    syl_line.append((lastTime, couple[1], int(couple[0])))
-                    lastTime += int(couple[0])
-                syl_line.append([lastTime, '', 0])
-                SYLS.append(syl_line)
-        return SYLS
-
-
-
-
-    songfile = sys.argv[1]
-    if(len(sys.argv) >= 3):
-        reference_syls = getSyls(sys.argv[2])
-    else:
-        reference_syls = None
-    
-    print(reference_syls)
-
-    backtrack = False
-
-
-    
-
-    audio_file = songfile                      # pre-computed source-separated vocals; These models do not work with mixture input.
-    word_file = None                           # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it
-    method = "MTL_BDR"                             # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
-    cuda=True                                 # set True if you have access to a GPU
-    checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
-
-    pred_file = "./MTL.csv"                    # saved alignment results, "(float) start_time, (float) end_time, (string) word"
-
-
-    lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
-    #print(lyrics_lines)
-
-
-    # load audio and lyrics
-    # words:        a list of words
-    # lyrics_p:     phoneme sequence of the target lyrics
-    # idx_word_p:   indices of word start in lyrics_p
-    # idx_line_p:   indices of line start in lyrics_p
-    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
-
-    # compute alignment
-    # word_align:   a list of frame indices aligned to each word
-    # words:        a list of words
-    word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=False, checkpoint_folder=checkpoint_folder)
-
-
-    print([[word_align[i][0], word_align[i][1], words[i]] for i in range(len(word_align))])
-
-
-    cnn = madmom.features.onsets.CNNOnsetProcessor()
-    spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
-
-    sig = madmom.audio.signal.Signal(songfile, num_channels=1)
-    parsel = parselmouth.Sound(sig)
-
-    spec = madmom.audio.spectrogram.Spectrogram(sig)
-    filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24)
-    log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1)
-
-    magnitude = np.max(log_spec[:,:100], axis=1)
-
-    cnn_function = cnn(sig)
-    spectral_function = spectral(sig)
-    spectral_function = spectral_function/(spectral_function.max())
-    
-    #activation_function = 0.5*cnn_function + 0.5*spectral_function
-    activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function)
-    #activation_function = np.where(spectral_function > 0.14, cnn_function, 0)
-    #onsets = proc(activation_function)
-    
-
-    if reference_syls:
-        activation_threshold = 0.1
-    else:
-        activation_threshold = 0.2
-
-    activation_smoothed = madmom.audio.signal.smooth(activation_function, 20)
-    cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20)
-    onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=activation_threshold, smooth=0)
-    #onsets = np.array([o for o in onsets if cnn_smoothed[o] > 0.1])
-
-    pitch = parsel.to_pitch()
-    pitch_values = pitch.selected_array['frequency']
-
-    pad_before = round(pitch.xs()[0]*100)
-    pad_after = len(magnitude) - len(pitch_values) - pad_before
-
-    pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0))
-
-    mask_function = magnitude * pitch_values
-    mask_function = mask_function/np.max(mask_function)
-    mask_threshold = 0.15
-    mask_window = [1,6]
-    invalid_onsets_idx = []
-    
-    for i in range(len(onsets)):
-        if np.max(mask_function[onsets[i]+mask_window[0]:onsets[i]+mask_window[1]]) < mask_threshold:
-            invalid_onsets_idx.append(i)
-    
-    onsets = np.delete(onsets, invalid_onsets_idx)
-
-
-    if reference_syls:
-        filtered_onsets = []
-        word_index = 0
-        previous_onset = 0
-        for line in reference_syls:
-            word_line = []
-            word_tmp = ""
-            word_syl_count = 0
-            for syl_index in range(len(line) - 1):
-                word_tmp += line[syl_index][1].strip()
-                word_syl_count += 1
-                #print("%s : %s" % (word_tmp, words[word_index]))
-                if(word_tmp == words[word_index]):
-                    print("Word %d recognized : %s" % (word_index, word_tmp))
-                    word_line.append([word_align[word_index][0], word_align[word_index][1], words[word_index], word_syl_count, 0])
-                    word_index += 1
-                    word_tmp = ""
-                    word_syl_count = 0 
-
-                num_words_done = 0
-                while(num_words_done < len(word_line)):
-                    for word in range(len(word_line)):
-                        window_start = word[0]
-
-            print(word_line)
-
-        onsets = np.array(sorted(filtered_onsets))
-
-    """ 
-        if word_index > 0:
-            word_start = max(word_align[word_index][0] - 5, line[0][0], previous_onset+1)
-        else:
-            word_start = line[0][0]
-        if word_index < len(words) - 1 and syl_index < len(line) - 2:
-            word_end = min(line[-1][0], word_align[word_index + 1][0] - 5)
-        else:
-            word_end = line[-1][0]
-
-        word_onsets = [o for o in onsets if (o >= word_start and o <= word_end)]
-        word_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
-        if word_syl_count > len(word_onsets):
-            print("WARNING : failed to detect enough onsets in word %s (%d, %d)" % (word_tmp, word_start, word_end))
-        filtered_onsets += word_onsets[0:word_syl_count]
-        print(word_onsets[0:word_syl_count])
-        previous_onset = max(word_onsets[0:word_syl_count] + [0])
-    """
-
-    # Backtrack onsets to closest earlier local minimum
-    if backtrack:
-        backtrack_max_frames = 50
-        for i in range(len(onsets)):
-            initial_onset = onsets[i]
-            while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames):
-                onsets[i] -= 1
-
-    print(onsets/100)
-
-    if reference_syls:
-        reference_onsets = [syl[0]+8 for line in reference_syls for syl in line[:-1]]
-
-    fig, axs = plt.subplots(nrows=2, sharex=True)
-    axs[0].imshow(log_spec.T, origin='lower', aspect='auto')
-    if reference_syls:
-        axs[0].vlines(reference_onsets, 0, 140, colors='red')
-    axs[0].plot((pitch_values/np.max(pitch_values))*140, color='yellow')
-    axs[1].plot(mask_function)
-    #axs[1].plot(cnn_smoothed)
-    #axs[1].plot(spectral_function, color='green')
-    axs[1].plot(activation_smoothed, color='orange')
-    axs[1].vlines(onsets, 0, 2, colors='red')
-    axs[1].hlines([max(mask_threshold, 0), activation_threshold], 0, onsets[-1]+100, colors='black')
-
-    #bins = np.arange(0, 1, 0.02)
-    #hist, hist_axs = plt.subplots(nrows=1)
-    #hist_axs.hist(mask_function, bins=bins)
-
-    plt.show()
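
Note on the kept timing hunk above: each detected word onset (in 100
frames/second units, shifted by `delay` and divided by 100 to get
seconds) is paired with the corresponding reference syllable text, and
each line is closed with the end time of its last aligned word. A rough
sketch of the resulting per-line structure (the values are hypothetical):

    # syls: one list per line,
    # [[onset_seconds, syl_text], ..., [line_end_seconds, '']]
    [[12.30, 'ka'], [12.71, 'ra'], [13.15, 'o'], [13.40, 'ke'], [13.90, '']]
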
diff --git a/plot_syls.py b/plot_syls.py
new file mode 100644
index 0000000..527c1e4
--- /dev/null
+++ b/plot_syls.py
@@ -0,0 +1,184 @@
+import madmom
+import numpy as np
+import sys
+import re
+import matplotlib.pyplot as plt
+import scipy.signal as sg
+import parselmouth
+
+from autosyl.assUtils import getSyls, timeToDate, dateToTime 
+from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
+
+
+##############################################################################
+#
+# Test script to visualize extracted onsets and other audio features.
+# It is mainly intended for development/debugging.
+#
+# If you just want to detect the syllables, use autokara.py instead.
+#
+##############################################################################
+
+songfile = sys.argv[1]
+if len(sys.argv) >= 3:
+    reference_syls = getSyls(sys.argv[2])
+else:
+    reference_syls = None
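+# NB: the alignment below assumes a reference ASS file was given; without one,
+# the lyrics_lines comprehension will fail on reference_syls == None.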
+
+print(reference_syls)
+
+backtrack = False
+
+
+
+
+audio_file = songfile                      # pre-computed source-separated vocals; These models do not work with mixture input.
+word_file = None                           # example: "jamendolyrics/lyrics/*.words.txt"; set to None if you don't have it
+method = "MTL_BDR"                             # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
+cuda = True                               # set to True if you have access to a GPU
+checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
+
+pred_file = "./MTL.csv"                    # saved alignment results, "(float) start_time, (float) end_time, (string) word"
+
+
+lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
+#print(lyrics_lines)
+
+
+# load audio and lyrics
+# words:        a list of words
+# lyrics_p:     phoneme sequence of the target lyrics
+# idx_word_p:   indices of word start in lyrics_p
+# idx_line_p:   indices of line start in lyrics_p
+audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
+
+# compute alignment
+# word_align:   a list of frame indices aligned to each word
+# words:        a list of words
+word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=cuda, checkpoint_folder=checkpoint_folder)  # honor the cuda flag set above
+
+
+print([[word_align[i][0], word_align[i][1], words[i]] for i in range(len(word_align))])
+words_onsets = np.array([word_align[i][0] for i in range(len(word_align))])
+
+
+cnn = madmom.features.onsets.CNNOnsetProcessor()
+spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
+
+sig = madmom.audio.signal.Signal(songfile, num_channels=1)
+parsel = parselmouth.Sound(sig)
+
+spec = madmom.audio.spectrogram.Spectrogram(sig)
+filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24)
+log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1)
+
+magnitude = np.max(log_spec[:,:100], axis=1)
+
+cnn_function = cnn(sig)
+spectral_function = spectral(sig)
+spectral_function = spectral_function/(spectral_function.max())
+
+#activation_function = 0.5*cnn_function + 0.5*spectral_function
+activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function)
+#activation_function = np.where(spectral_function > 0.14, cnn_function, 0)
+#onsets = proc(activation_function)
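+# The active formula above is the harmonic mean of the CNN and spectral
+# activations, which only stays high where both detectors agree.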
+
+
+if reference_syls:
+    activation_threshold = 0.1
+else:
+    activation_threshold = 0.2
+
+activation_smoothed = madmom.audio.signal.smooth(activation_function, 20)
+cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20)
+onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=activation_threshold, smooth=0)
+#onsets = np.array([o for o in onsets if cnn_smoothed[o] > 0.1])
+
+pitch = parsel.to_pitch()
+pitch_values = pitch.selected_array['frequency']
+
+pad_before = round(pitch.xs()[0]*100)
+pad_after = len(magnitude) - len(pitch_values) - pad_before
+
+pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0))
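+# Zero-pad the Praat pitch track so it lines up frame-for-frame with the
+# spectrogram-derived magnitude curve (both assumed on a 100 fps grid).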
+
+mask_function = magnitude * pitch_values
+mask_function = mask_function/np.max(mask_function)
+mask_threshold = 0.15
+mask_window = [1,6]
+invalid_onsets_idx = []
+
+for i in range(len(onsets)):
+    if np.max(mask_function[onsets[i]+mask_window[0]:onsets[i]+mask_window[1]]) < mask_threshold:
+        invalid_onsets_idx.append(i)
+
+onsets = np.delete(onsets, invalid_onsets_idx)
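+# Drop onsets with no pitched energy above mask_threshold in the few frames
+# that follow them (mask_window): these are treated as spurious detections.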
+
+
+
+if reference_syls:
+    filtered_onsets = []
+    line_index = 0
+    for line in reference_syls:
+        line_index += 1
+        syl_number = len(line) - 1
+        line_onsets = [o for o in onsets if (o >= line[0][0] and o <= line[-1][0])]
+        line_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
+        if syl_number > len(line_onsets):
+            print("WARNING : failed to detect enough onsets in line %d (%d, %d)" % (line_index, line[0][0], line[-1][0]))
+        filtered_onsets += line_onsets[0:syl_number]
+    
+    onsets = np.array(sorted(filtered_onsets))
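+# Per reference line, keep only the N strongest onsets (ranked by smoothed
+# activation), where N is that line's syllable count.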
+
+
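+# Leftover word-level filtering experiment, kept for reference (never executed):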
+""" 
+    if word_index > 0:
+        word_start = max(word_align[word_index][0] - 5, line[0][0], previous_onset+1)
+    else:
+        word_start = line[0][0]
+    if word_index < len(words) - 1 and syl_index < len(line) - 2:
+        word_end = min(line[-1][0], word_align[word_index + 1][0] - 5)
+    else:
+        word_end = line[-1][0]
+
+    word_onsets = [o for o in onsets if (o >= word_start and o <= word_end)]
+    word_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
+    if word_syl_count > len(word_onsets):
+        print("WARNING : failed to detect enough onsets in word %s (%d, %d)" % (word_tmp, word_start, word_end))
+    filtered_onsets += word_onsets[0:word_syl_count]
+    print(word_onsets[0:word_syl_count])
+    previous_onset = max(word_onsets[0:word_syl_count] + [0])
+"""
+
+# Backtrack onsets to closest earlier local minimum
+if backtrack:
+    backtrack_max_frames = 50
+    for i in range(len(onsets)):
+        initial_onset = onsets[i]
+        while activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames:
+            onsets[i] -= 1
+
+#print(onsets/100)
+print(words_onsets/100)
+
+if reference_syls:
+    reference_onsets = [syl[0]+8 for line in reference_syls for syl in line[:-1]]
+
+fig, axs = plt.subplots(nrows=2, sharex=True)
+axs[0].imshow(log_spec.T, origin='lower', aspect='auto')
+if reference_syls:
+    axs[0].vlines(reference_onsets, 0, 140, colors='red')
+axs[0].plot((pitch_values/np.max(pitch_values))*140, color='yellow')
+axs[1].plot(mask_function)
+#axs[1].plot(cnn_smoothed)
+#axs[1].plot(spectral_function, color='green')
+axs[1].plot(activation_smoothed, color='orange')
+axs[1].vlines(onsets, 0, 2, colors='red')
+axs[1].vlines(words_onsets, 0, 3, colors='m')
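+# red lines: filtered peak-picked onsets; magenta lines: aligned word onsets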
+axs[1].hlines([max(mask_threshold, 0), activation_threshold], 0, onsets[-1]+100, colors='black')
+
+#bins = np.arange(0, 1, 0.02)
+#hist, hist_axs = plt.subplots(nrows=1)
+#hist_axs.hist(mask_function, bins=bins)
+
+plt.show()
\ No newline at end of file
-- 
GitLab