From c3d5164c181d0ba0368df7514d1824dedc5d8fcf Mon Sep 17 00:00:00 2001
From: Sting <loic.allegre@ensiie.fr>
Date: Fri, 21 Jul 2023 11:08:24 +0200
Subject: [PATCH] Refactor: use lyrics-alignment onsets in segment(), move
 debug plotting to plot_syls.py

---
 autokara.py        |   6 +-
 autosyl/segment.py | 297 ++------------------------------------------
 plot_syls.py       | 184 ++++++++++++++++++++++++++++
 3 files changed, 196 insertions(+), 291 deletions(-)
 create mode 100644 plot_syls.py

diff --git a/autokara.py b/autokara.py
index f4d3ba6..1da2d81 100644
--- a/autokara.py
+++ b/autokara.py
@@ -41,12 +41,12 @@ else:
     vocals_file = args.source_file
 
 if args.ref:
-    reference_syls = getSyls(args.ref)
+    reference_file = args.ref
 else:
-    reference_syls = None
+    reference_file = None
 
 print("Identifying syl starts...")
-syls = segment(vocals_file, reference_syls=reference_syls)
+syls = segment(vocals_file, reference_file=reference_file)
 print(syls)
 
 print("Syls found, writing ASS file...")
diff --git a/autosyl/segment.py b/autosyl/segment.py
index 649b1f1..b237350 100644
--- a/autosyl/segment.py
+++ b/autosyl/segment.py
@@ -11,11 +11,17 @@ from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
 
 
 
-def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
+def segment(songfile, reference_file=None, syls_per_line=10, last_syl_dur=500):
 
     delay = -4
     backtrack = False
 
+    if reference_file:
+        reference_syls = getSyls(reference_file)
+    else:
+        reference_syls = []
+
+    print(reference_syls)
 
     audio_file = songfile                       # pre-computed source-separated vocals; These models do not work with mixture input.
     word_file = None                            # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it
@@ -46,87 +52,6 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
     words_onsets = np.array([word_align[i][0] for i in range(len(word_align))])
     print(words_onsets)
 
-    '''
-    cnn = madmom.features.onsets.CNNOnsetProcessor()
-    spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
-
-    sig = madmom.audio.signal.Signal(songfile, num_channels=1)
-    parsel = parselmouth.Sound(sig)
-
-    spec = madmom.audio.spectrogram.Spectrogram(sig)
-    filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24)
-    log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1)
-
-    magnitude = np.max(log_spec[:,:100], axis=1)
-
-    cnn_function = cnn(sig)
-    spectral_function = spectral(sig)
-    spectral_function = spectral_function/(spectral_function.max())
-
-    #activation_function = 0.5*cnn_function + 0.5*spectral_function
-    activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function)
-    #activation_function = np.where(spectral_function > 0.14, cnn_function, 0)
-    #onsets = proc(activation_function)
-
-    if reference_syls:
-        activation_threshold = 0.1
-    else:
-        activation_threshold = 0.2
-
-
-    activation_smoothed = madmom.audio.signal.smooth(activation_function, 20)
-    cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20)
-    onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=activation_threshold, smooth=0)
-    #onsets = np.array([o for o in onsets if cnn_smoothed[o] > activation_threshold])
-
-    pitch = parsel.to_pitch()
-    pitch_values = pitch.selected_array['frequency']
-
-    pad_before = round(pitch.xs()[0]*100)
-    pad_after = len(magnitude) - len(pitch_values) - pad_before
-
-    pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0))
-
-    mask_function = magnitude * pitch_values
-    mask_function = mask_function/np.max(mask_function)
-    mask_threshold = 0.15
-    mask_window = [1,6]
-    invalid_onsets_idx = []
-
-    for i in range(len(onsets)):
-        if np.max(mask_function[onsets[i]+mask_window[0]:onsets[i]+mask_window[1]]) < mask_threshold:
-            invalid_onsets_idx.append(i)
-
-    onsets = np.delete(onsets, invalid_onsets_idx)
-
-
-    if reference_syls:
-        filtered_onsets = []
-        line_index = 0
-        for line in reference_syls:
-            line_index += 1
-            syl_number = len(line) - 1
-            line_onsets = [o for o in onsets if (o >= line[0][0] and o <= line[-1][0])]
-            line_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
-            missing_syls = 0
-            if syl_number > len(line_onsets):
-                print("WARNING : failed to detect enough onsets in line %d (%d, %d)" % (line_index, line[0][0], line[-1][0]))
-                missing_syls = syl_number - len(line_onsets)
-            filtered_onsets += line_onsets[0:syl_number]
-            filtered_onsets += [line[-1][0] for i in range(missing_syls)]   # If missing some syllables, pad with 0-length syls
-
-        onsets = np.array(sorted(filtered_onsets))
-
-
-    if backtrack:
-        # Backtrack onsets to closest earlier local minimum
-        backtrack_max_frames = 50
-        for i in range(len(onsets)):
-            initial_onset = onsets[i]
-            while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames):
-                onsets[i] -= 1
-    '''
-
     onsets = words_onsets
     onsets = (onsets + delay)/100
     #print(onsets)
@@ -136,8 +61,8 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
         onset_index = 0
         for line in reference_syls:
             #print(onset_index, " : ", line)
-            l = [[onsets[onset_index + i], words[onset_index + i]] for i in range(len(line)-1)]
-            l.append([line[-1][0]/100, ''])
+            l = [[onsets[onset_index + i], line[i][1]] for i in range(len(line)-1)]
+            l.append([word_align[onset_index + (len(line) - 2)][1]/100, ''])
             syls.append(l)
             onset_index += (len(line) - 1)
     else:
@@ -158,207 +83,3 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
 
 
     return syls
-
-
-if __name__ == "__main__":
-
-    def dateToTime(date):
-        """
-        The `date` should be in the following format: H:MM:SS.cs
-        """
-        hourInMinuts = int(date[0:1]) * 60
-        minutsInSeconds = (int(date[2:4]) + hourInMinuts) * 60
-        secondsInCentiseconds = (int(date[5:7]) + minutsInSeconds) * 100
-        return int(date[8:10]) + secondsInCentiseconds
-
-
-    def getSyls(ass_file):
-        SYLS = []
-        with open(ass_file, 'r') as f:
-            CONTENT = f.read()
-            LINES_KARA = re.compile(r"Comment:.*(\d+:\d{2}:\d{2}.\d{2}),(\d+:\d{2}:\d{2}.\d{2}),.*,karaoke,(.*)\n");
-            RGX_TAGS = re.compile(r"\{\\k(\d+)\}([^\{\n\r]*)")
-            for line in LINES_KARA.findall(CONTENT):
-                syl_line = []
-                lastTime = dateToTime(line[0])
-                for couple in RGX_TAGS.findall(line[2]):
-                    syl_line.append((lastTime, couple[1], int(couple[0])))
-                    lastTime += int(couple[0])
-                syl_line.append([lastTime, '', 0])
-                SYLS.append(syl_line)
-        return SYLS
-
-
-
-
-    songfile = sys.argv[1]
-    if(len(sys.argv) >= 3):
-        reference_syls = getSyls(sys.argv[2])
-    else:
-        reference_syls = None
-
-    print(reference_syls)
-
-    backtrack = False
-
-
-
-
-    audio_file = songfile                       # pre-computed source-separated vocals; These models do not work with mixture input.
-    word_file = None                            # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it
-    method = "MTL_BDR"                          # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
-    cuda=True                                   # set True if you have access to a GPU
-    checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
-
-    pred_file = "./MTL.csv"                     # saved alignment results, "(float) start_time, (float) end_time, (string) word"
-
-
-    lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
-    #print(lyrics_lines)
-
-
-    # load audio and lyrics
-    # words:        a list of words
-    # lyrics_p:     phoneme sequence of the target lyrics
-    # idx_word_p:   indices of word start in lyrics_p
-    # idx_line_p:   indices of line start in lyrics_p
-    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
-
-    # compute alignment
-    # word_align:   a list of frame indices aligned to each word
-    # words:        a list of words
-    word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=False, checkpoint_folder=checkpoint_folder)
-
-
-    print([[word_align[i][0], word_align[i][1], words[i]] for i in range(len(word_align))])
-
-
-    cnn = madmom.features.onsets.CNNOnsetProcessor()
-    spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
-
-    sig = madmom.audio.signal.Signal(songfile, num_channels=1)
-    parsel = parselmouth.Sound(sig)
-
-    spec = madmom.audio.spectrogram.Spectrogram(sig)
-    filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24)
-    log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1)
-
-    magnitude = np.max(log_spec[:,:100], axis=1)
-
-    cnn_function = cnn(sig)
-    spectral_function = spectral(sig)
-    spectral_function = spectral_function/(spectral_function.max())
-
-    #activation_function = 0.5*cnn_function + 0.5*spectral_function
-    activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function)
-    #activation_function = np.where(spectral_function > 0.14, cnn_function, 0)
-    #onsets = proc(activation_function)
-
-
-    if reference_syls:
-        activation_threshold = 0.1
-    else:
-        activation_threshold = 0.2
-
-    activation_smoothed = madmom.audio.signal.smooth(activation_function, 20)
-    cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20)
-    onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=activation_threshold, smooth=0)
-    #onsets = np.array([o for o in onsets if cnn_smoothed[o] > 0.1])
-
-    pitch = parsel.to_pitch()
-    pitch_values = pitch.selected_array['frequency']
-
-    pad_before = round(pitch.xs()[0]*100)
-    pad_after = len(magnitude) - len(pitch_values) - pad_before
-
-    pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0))
-
-    mask_function = magnitude * pitch_values
-    mask_function = mask_function/np.max(mask_function)
-    mask_threshold = 0.15
-    mask_window = [1,6]
-    invalid_onsets_idx = []
-
-    for i in range(len(onsets)):
-        if np.max(mask_function[onsets[i]+mask_window[0]:onsets[i]+mask_window[1]]) < mask_threshold:
-            invalid_onsets_idx.append(i)
-
-    onsets = np.delete(onsets, invalid_onsets_idx)
-
-
-    if reference_syls:
-        filtered_onsets = []
-        word_index = 0
-        previous_onset = 0
-        for line in reference_syls:
-            word_line = []
-            word_tmp = ""
-            word_syl_count = 0
-            for syl_index in range(len(line) - 1):
-                word_tmp += line[syl_index][1].strip()
-                word_syl_count += 1
-                #print("%s : %s" % (word_tmp, words[word_index]))
-                if(word_tmp == words[word_index]):
-                    print("Word %d recognized : %s" % (word_index, word_tmp))
-                    word_line.append([word_align[word_index][0], word_align[word_index][1], words[word_index], word_syl_count, 0])
-                    word_index += 1
-                    word_tmp = ""
-                    word_syl_count = 0
-
-            num_words_done = 0
-            while(num_words_done < len(word_line)):
-                for word in range(len(word_line)):
-                    window_start = word[0]
-
-            print(word_line)
-
-        onsets = np.array(sorted(filtered_onsets))
-
-        """
-        if word_index > 0:
-            word_start = max(word_align[word_index][0] - 5, line[0][0], previous_onset+1)
-        else:
-            word_start = line[0][0]
-        if word_index < len(words) - 1 and syl_index < len(line) - 2:
-            word_end = min(line[-1][0], word_align[word_index + 1][0] - 5)
-        else:
-            word_end = line[-1][0]
-
-        word_onsets = [o for o in onsets if (o >= word_start and o <= word_end)]
-        word_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
-        if word_syl_count > len(word_onsets):
-            print("WARNING : failed to detect enough onsets in word %s (%d, %d)" % (word_tmp, word_start, word_end))
-        filtered_onsets += word_onsets[0:word_syl_count]
-        print(word_onsets[0:word_syl_count])
-        previous_onset = max(word_onsets[0:word_syl_count] + [0])
-        """
-
-    # Backtrack onsets to closest earlier local minimum
-    if backtrack:
-        backtrack_max_frames = 50
-        for i in range(len(onsets)):
-            initial_onset = onsets[i]
-            while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames):
-                onsets[i] -= 1
-
-    print(onsets/100)
-
-    if reference_syls:
-        reference_onsets = [syl[0]+8 for line in reference_syls for syl in line[:-1]]
-
-    fig, axs = plt.subplots(nrows=2, sharex=True)
-    axs[0].imshow(log_spec.T, origin='lower', aspect='auto')
-    if reference_syls:
-        axs[0].vlines(reference_onsets, 0, 140, colors='red')
-    axs[0].plot((pitch_values/np.max(pitch_values))*140, color='yellow')
-    axs[1].plot(mask_function)
-    #axs[1].plot(cnn_smoothed)
-    #axs[1].plot(spectral_function, color='green')
-    axs[1].plot(activation_smoothed, color='orange')
-    axs[1].vlines(onsets, 0, 2, colors='red')
-    axs[1].hlines([max(mask_threshold, 0), activation_threshold], 0, onsets[-1]+100, colors='black')
-
-    #bins = np.arange(0, 1, 0.02)
-    #hist, hist_axs = plt.subplots(nrows=1)
-    #hist_axs.hist(mask_function, bins=bins)
-
-    plt.show()
diff --git a/plot_syls.py b/plot_syls.py
new file mode 100644
index 0000000..527c1e4
--- /dev/null
+++ b/plot_syls.py
@@ -0,0 +1,184 @@
+import madmom
+import numpy as np
+import sys
+import re
+import matplotlib.pyplot as plt
+import scipy.signal as sg
+import parselmouth
+
+from autosyl.assUtils import getSyls, timeToDate, dateToTime
+from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
+
+
+##############################################################################
+#
+# This is a test script to visualize extracted onsets and other audio features.
+# It is mainly intended for development and debugging.
+#
+# If you just want to detect syllables, use autokara.py instead.
+#
+##############################################################################
+
+songfile = sys.argv[1]
+if(len(sys.argv) >= 3):
+    reference_syls = getSyls(sys.argv[2])
+else:
+    reference_syls = None
+
+print(reference_syls)
+
+backtrack = False
+
+
+
+
+audio_file = songfile                       # pre-computed source-separated vocals; These models do not work with mixture input.
+word_file = None                            # example: "jamendolyrics/lyrics/*.words.txt"; set to None if you don't have it
+method = "MTL_BDR"                          # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
+cuda=True                                   # set True if you have access to a GPU
+checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
+
+pred_file = "./MTL.csv"                     # saved alignment results, "(float) start_time, (float) end_time, (string) word"
+
+
+lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]   # note: a reference .ass file (sys.argv[2]) is required from here on
+#print(lyrics_lines)
+
+
+# load audio and lyrics
+# words:        a list of words
+# lyrics_p:     phoneme sequence of the target lyrics
+# idx_word_p:   indices of word start in lyrics_p
+# idx_line_p:   indices of line start in lyrics_p
+audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
+
+# compute alignment
+# word_align:   a list of frame indices aligned to each word
+# words:        a list of words
+word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=cuda, checkpoint_folder=checkpoint_folder)   # honor the cuda flag set above
+
+
+print([[word_align[i][0], word_align[i][1], words[i]] for i in range(len(word_align))])
+words_onsets = np.array([word_align[i][0] for i in range(len(word_align))])
+
+
+cnn = madmom.features.onsets.CNNOnsetProcessor()
+spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
+
+sig = madmom.audio.signal.Signal(songfile, num_channels=1)
+parsel = parselmouth.Sound(sig)
+
+spec = madmom.audio.spectrogram.Spectrogram(sig)
+filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24)
+log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1)
+
+magnitude = np.max(log_spec[:,:100], axis=1)
+
+cnn_function = cnn(sig)
+spectral_function = spectral(sig)
+spectral_function = spectral_function/(spectral_function.max())
+
+#activation_function = 0.5*cnn_function + 0.5*spectral_function
+activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function)
+#activation_function = np.where(spectral_function > 0.14, cnn_function, 0)
+#onsets = proc(activation_function)
+
+
+if reference_syls:
+    activation_threshold = 0.1
+else:
+    activation_threshold = 0.2
+
+activation_smoothed = madmom.audio.signal.smooth(activation_function, 20)
+cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20)
+onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=activation_threshold, smooth=0)
+#onsets = np.array([o for o in onsets if cnn_smoothed[o] > 0.1])
+
+pitch = parsel.to_pitch()
+pitch_values = pitch.selected_array['frequency']
+
+pad_before = round(pitch.xs()[0]*100)
+pad_after = len(magnitude) - len(pitch_values) - pad_before
+
+pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0))
+
+mask_function = magnitude * pitch_values
+mask_function = mask_function/np.max(mask_function)
+mask_threshold = 0.15
+mask_window = [1,6]
+invalid_onsets_idx = []
+
+for i in range(len(onsets)):
+    if np.max(mask_function[onsets[i]+mask_window[0]:onsets[i]+mask_window[1]]) < mask_threshold:
+        invalid_onsets_idx.append(i)
+
+onsets = np.delete(onsets, invalid_onsets_idx)
+
+
+
+if reference_syls:
+    filtered_onsets = []
+    line_index = 0
+    for line in reference_syls:
+        line_index += 1
+        syl_number = len(line) - 1
+        line_onsets = [o for o in onsets if (o >= line[0][0] and o <= line[-1][0])]
+        line_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
+        if syl_number > len(line_onsets):
+            print("WARNING : failed to detect enough onsets in line %d (%d, %d)" % (line_index, line[0][0], line[-1][0]))
+        filtered_onsets += line_onsets[0:syl_number]
+
+    onsets = np.array(sorted(filtered_onsets))
+
+
+"""
+    if word_index > 0:
+        word_start = max(word_align[word_index][0] - 5, line[0][0], previous_onset+1)
+    else:
+        word_start = line[0][0]
+    if word_index < len(words) - 1 and syl_index < len(line) - 2:
+        word_end = min(line[-1][0], word_align[word_index + 1][0] - 5)
+    else:
+        word_end = line[-1][0]
+
+    word_onsets = [o for o in onsets if (o >= word_start and o <= word_end)]
+    word_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
+    if word_syl_count > len(word_onsets):
+        print("WARNING : failed to detect enough onsets in word %s (%d, %d)" % (word_tmp, word_start, word_end))
+    filtered_onsets += word_onsets[0:word_syl_count]
+    print(word_onsets[0:word_syl_count])
+    previous_onset = max(word_onsets[0:word_syl_count] + [0])
+"""
+
+# Backtrack onsets to closest earlier local minimum
+if backtrack:
+    backtrack_max_frames = 50
+    for i in range(len(onsets)):
+        initial_onset = onsets[i]
+        while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames):
+            onsets[i] -= 1
+
+#print(onsets/100)
+print(words_onsets/100)
+
+if reference_syls:
+    reference_onsets = [syl[0]+8 for line in reference_syls for syl in line[:-1]]
+
+fig, axs = plt.subplots(nrows=2, sharex=True)
+axs[0].imshow(log_spec.T, origin='lower', aspect='auto')
+if reference_syls:
+    axs[0].vlines(reference_onsets, 0, 140, colors='red')
+axs[0].plot((pitch_values/np.max(pitch_values))*140, color='yellow')
+axs[1].plot(mask_function)
+#axs[1].plot(cnn_smoothed)
+#axs[1].plot(spectral_function, color='green')
+axs[1].plot(activation_smoothed, color='orange')
+axs[1].vlines(onsets, 0, 2, colors='red')
+axs[1].vlines(words_onsets, 0, 3, colors='m')
+axs[1].hlines([max(mask_threshold, 0), activation_threshold], 0, onsets[-1]+100, colors='black')
+
+#bins = np.arange(0, 1, 0.02)
+#hist, hist_axs = plt.subplots(nrows=1)
+#hist_axs.hist(mask_function, bins=bins)
+
+plt.show()
\ No newline at end of file
-- 
GitLab
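For reviewers: a minimal usage sketch of the entry point as refactored by this
patch. segment() now receives the reference .ass path via reference_file and
parses it internally with getSyls(), where autokara.py previously passed
pre-parsed syllables. The file paths below are placeholders, not files from
this repository.

    # Usage sketch under the layout introduced by this patch;
    # "vocals.wav" and "reference.ass" are placeholder paths.
    from autosyl.segment import segment

    # segment() parses the reference .ass itself via getSyls()
    syls = segment("vocals.wav", reference_file="reference.ass")

    # One list per karaoke line: [onset_seconds, syllable_text] entries,
    # terminated by a [line_end_seconds, ''] marker.
    for line in syls:
        print(line)

With reference_file=None, segment() presumably falls back to the fixed
syls_per_line/last_syl_dur grouping in the else: branch that this patch
leaves untouched.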