From f56ebd14bde28a5f339e9b7d5ca0f7ee83c2ac97 Mon Sep 17 00:00:00 2001
From: Sting <lallegre26@gmail.com>
Date: Thu, 20 Jul 2023 23:08:28 +0200
Subject: [PATCH] Test stuff, use LyrAlign on a syllable basis

---
 autokara.py                        |   2 +-
 autosyl/LyricsAlignment/model.py   |   2 +-
 autosyl/LyricsAlignment/wrapper.py |  20 +++-
 assUtils.py => autosyl/assUtils.py |   3 +-
 autosyl/segment.py                 | 123 ++++++++++++++++++++++++++---
 5 files changed, 128 insertions(+), 22 deletions(-)
 rename assUtils.py => autosyl/assUtils.py (98%)

diff --git a/autokara.py b/autokara.py
index 9cbf49d..f4d3ba6 100644
--- a/autokara.py
+++ b/autokara.py
@@ -4,8 +4,8 @@ import demucs.separate
 import subprocess
 import shlex
 from pathlib import Path
-from assUtils import AssWriter, getSyls
+from autosyl.assUtils import AssWriter, getSyls
 
 from autosyl.segment import segment
 
 
diff --git a/autosyl/LyricsAlignment/model.py b/autosyl/LyricsAlignment/model.py
index 33669d6..f6fd66b 100644
--- a/autosyl/LyricsAlignment/model.py
+++ b/autosyl/LyricsAlignment/model.py
@@ -4,7 +4,7 @@ import torch.nn.functional as F
 import torchaudio
 import warnings
 
-from utils import notes_to_pc
+from autosyl.LyricsAlignment.utils import notes_to_pc
 
 # following FFT parameters are designed for a 22.5k sampling rate
 sr = 22050
diff --git a/autosyl/LyricsAlignment/wrapper.py b/autosyl/LyricsAlignment/wrapper.py
index 1727822..120fcfe 100644
--- a/autosyl/LyricsAlignment/wrapper.py
+++ b/autosyl/LyricsAlignment/wrapper.py
@@ -5,8 +5,8 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-import utils
-from model import train_audio_transforms, AcousticModel, BoundaryDetection
+import autosyl.LyricsAlignment.utils as utils
+from autosyl.LyricsAlignment.model import train_audio_transforms, AcousticModel, BoundaryDetection
 
 np.random.seed(7)
 
@@ -17,7 +17,7 @@ def preprocess_from_file(audio_file, lyrics_file, word_file=None):
 
     return y, words, lyrics_p, idx_word_p, idx_line_p
 
-def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cuda=True):
+def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cuda=True, checkpoint_folder="."):
 
     # start timer
     t = time()
@@ -61,7 +61,7 @@ def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cud
     ).to(device)
 
     print("Loading acoustic model from checkpoint...")
-    state = utils.load_model(ac_model, "./checkpoints/checkpoint_{}".format(model_type), cuda=(device=="gpu"))
+    state = utils.load_model(ac_model, "{}/checkpoint_{}".format(checkpoint_folder, model_type), cuda=(device=="gpu"))
     ac_model.eval()
 
     print("Computing phoneme posteriorgram...")
@@ -106,7 +106,7 @@ def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cud
                        bdr_hparams['n_feats'], bdr_hparams['stride'], bdr_hparams['dropout']
         ).to(device)
         print("Loading BDR model from checkpoint...")
-        state = utils.load_model(bdr_model, "./checkpoints/checkpoint_BDR", cuda=(device == "gpu"))
+        state = utils.load_model(bdr_model, "{}/checkpoint_BDR".format(checkpoint_folder), cuda=(device == "gpu"))
         bdr_model.eval()
 
         print("Computing boundary probability curve...")
@@ -128,6 +128,9 @@ def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cud
     t = time() - t
     print("Alignment Score:\t{}\tTime:\t{}".format(score, t))
 
+    resolution = 25600 / 22050 * 3  # model frames -> centiseconds (one frame = 3 hops of 256 samples at 22050 Hz)
+    word_align = [[round(word[0] * resolution), round(word[1] * resolution)] for word in word_align]
+
     return word_align, words
 
 def preprocess_audio(audio_file, sr=22050):
@@ -140,7 +143,7 @@ def preprocess_audio(audio_file, sr=22050):
 
     return y, curr_sr
 
-def preprocess_lyrics(lyrics_file, word_file=None):
+def preprocess_lyrics(lyrics_lines, word_file=None):
     from string import ascii_lowercase
     d = {ascii_lowercase[i]: i for i in range(26)}
     d["'"] = 26
@@ -148,8 +151,9 @@ def preprocess_lyrics(lyrics_file, word_file=None):
     d["~"] = 28
 
     # process raw
-    with open(lyrics_file, 'r') as f:
-        raw_lines = f.read().splitlines()
+    #with open(lyrics_file, 'r') as f:
+    #    raw_lines = f.read().splitlines()
+    raw_lines = lyrics_lines
 
     raw_lines = ["".join([c for c in line.lower() if c in d.keys()]).strip() for line in raw_lines]
     raw_lines = [" ".join(line.split()) for line in raw_lines if len(line) > 0]
diff --git a/assUtils.py b/autosyl/assUtils.py
similarity index 98%
rename from assUtils.py
rename to autosyl/assUtils.py
index 7b5fd28..a9c4444 100644
--- a/assUtils.py
+++ b/autosyl/assUtils.py
@@ -33,7 +33,8 @@ def getSyls(ass_file):
             syl_line = []
             lastTime = dateToTime(line[0])
             for couple in RGX_TAGS.findall(line[2]):
-                syl_line.append([lastTime, couple[1], int(couple[0])])
+                if(couple[1] != ''):
+                    syl_line.append([lastTime, couple[1], int(couple[0])])
                 lastTime += int(couple[0])
             syl_line.append([lastTime, '', 0])
             SYLS.append(syl_line)
diff --git a/autosyl/segment.py b/autosyl/segment.py
index 5f4d785..8b63283 100644
--- a/autosyl/segment.py
+++ b/autosyl/segment.py
@@ -6,12 +6,47 @@ import matplotlib.pyplot as plt
 import scipy.signal as sg
 import parselmouth
 
+from autosyl.assUtils import getSyls, timeToDate, dateToTime
+from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
+
+
 
 def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
 
     delay = -4
     backtrack = False
+
+    audio_file = songfile                # pre-computed source-separated vocals; these models do not work with mixture input
+    word_file = None                     # example: jamendolyrics/lyrics/*.words.txt; set to None if you don't have it
+    method = "MTL_BDR"                   # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
+    cuda = False                         # set True if you have access to a GPU
+    checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
+
+    pred_file = "./MTL.csv"              # saved alignment results: "(float) start_time, (float) end_time, (string) word"
+
+
+    lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
+    #print(lyrics_lines)
+
+
+    # load audio and lyrics
+    # words:      a list of words
+    # lyrics_p:   phoneme sequence of the target lyrics
+    # idx_word_p: indices of word start in lyrics_p
+    # idx_line_p: indices of line start in lyrics_p
+    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
+
+    # compute alignment
+    # word_align: a list of frame indices aligned to each word
+    # words:      a list of words
+    word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=cuda, checkpoint_folder=checkpoint_folder)
+
+
+    words_onsets = np.array([word_align[i][0] for i in range(len(word_align))])  # onset of each aligned word, in the units set by align()
+
+
+
 
     cnn = madmom.features.onsets.CNNOnsetProcessor()
     spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
@@ -91,6 +126,7 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
             while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames):
                 onsets[i] -= 1
 
+    onsets = words_onsets  # override the detected onsets with the aligned word onsets
     onsets = (onsets + delay)/100
 
     #print(onsets)
@@ -159,10 +195,42 @@ if __name__ == "__main__":
     else:
         reference_syls = None
 
-    #print(reference_syls)
+    print(reference_syls)
 
     backtrack = False
+
+
+
+    audio_file = songfile                # pre-computed source-separated vocals; these models do not work with mixture input
+    word_file = None                     # example: jamendolyrics/lyrics/*.words.txt; set to None if you don't have it
+    method = "MTL_BDR"                   # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
+    cuda = False                         # set True if you have access to a GPU
+    checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
+
+    pred_file = "./MTL.csv"              # saved alignment results: "(float) start_time, (float) end_time, (string) word"
+
+
+    lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
+    #print(lyrics_lines)
+
+
+    # load audio and lyrics
+    # words:      a list of words
+    # lyrics_p:   phoneme sequence of the target lyrics
+    # idx_word_p: indices of word start in lyrics_p
+    # idx_line_p: indices of line start in lyrics_p
+    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
+
+    # compute alignment
+    # word_align: a list of frame indices aligned to each word
+    # words:      a list of words
+    word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=cuda, checkpoint_folder=checkpoint_folder)
+
+
+    print([[word_align[i][0], word_align[i][1], words[i]] for i in range(len(word_align))])
+
 
     cnn = madmom.features.onsets.CNNOnsetProcessor()
     spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
@@ -218,18 +286,51 @@ if __name__ == "__main__":
 
     if reference_syls:
         filtered_onsets = []
-        line_index = 0
+        word_index = 0
+        previous_onset = 0
         for line in reference_syls:
-            line_index += 1
-            syl_number = len(line) - 1
-            line_onsets = [o for o in onsets if (o >= line[0][0] and o <= line[-1][0])]
-            line_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
-            if syl_number > len(line_onsets):
-                print("WARNING : failed to detect enough onsets in line %d (%d, %d)" % (line_index, line[0][0], line[-1][0]))
-            filtered_onsets += line_onsets[0:syl_number]
-
+            word_line = []
+            word_tmp = ""
+            word_syl_count = 0
+            for syl_index in range(len(line) - 1):
+                word_tmp += line[syl_index][1].strip()
+                word_syl_count += 1
+                #print("%s : %s" % (word_tmp, words[word_index]))
+                if(word_tmp == words[word_index]):
+                    print("Word %d recognized : %s" % (word_index, word_tmp))
+                    word_line.append([word_align[word_index][0], word_align[word_index][1], words[word_index], word_syl_count, 0])
+                    word_index += 1
+                    word_tmp = ""
+                    word_syl_count = 0
+
+            num_words_done = 0
+            while num_words_done < len(word_line):
+                for word in word_line:
+                    window_start = word[0]  # word onset; the per-word windowing logic is still to be written
+                num_words_done += 1
+
+            print(word_line)
+
         onsets = np.array(sorted(filtered_onsets))
-
+
+        """
+        if word_index > 0:
+            word_start = max(word_align[word_index][0] - 5, line[0][0], previous_onset+1)
+        else:
+            word_start = line[0][0]
+        if word_index < len(words) - 1 and syl_index < len(line) - 2:
+            word_end = min(line[-1][0], word_align[word_index + 1][0] - 5)
+        else:
+            word_end = line[-1][0]
+
+        word_onsets = [o for o in onsets if (o >= word_start and o <= word_end)]
+        word_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
+        if word_syl_count > len(word_onsets):
+            print("WARNING : failed to detect enough onsets in word %s (%d, %d)" % (word_tmp, word_start, word_end))
+        filtered_onsets += word_onsets[0:word_syl_count]
+        print(word_onsets[0:word_syl_count])
+        previous_onset = max(word_onsets[0:word_syl_count] + [0])
+        """
+
     # Backtrack onsets to closest earlier local minimum
     if backtrack:
         backtrack_max_frames = 50
-- 
GitLab
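
Usage sketch (not part of the patch): with this change, align() rescales word_align from model frames to centiseconds (25600 / 22050 * 3 ≈ 3.48 cs per frame), and segment() substitutes the aligned word onsets for the madmom onset candidates. The Python sketch below shows how the patched modules could fit together, following the imports already used in autokara.py; the media paths are hypothetical, and segment()'s return value is an assumption, since the diff does not show its return statement.

    # Minimal sketch of the patched flow (paths are illustrative assumptions).
    from autosyl.assUtils import getSyls
    from autosyl.segment import segment

    vocals_file = "media/vocals.wav"       # source-separated vocals; these models do not work on the full mix
    reference_ass = "media/reference.ass"  # timed lines with {\kXX} syllable tags

    # getSyls() yields, per line, [start_time, syllable, duration] entries
    # plus a closing sentinel, as built in autosyl/assUtils.py.
    reference_syls = getSyls(reference_ass)

    # segment() builds its lyric lines from reference_syls, runs the
    # LyricsAlignment wrapper against ./autosyl/LyricsAlignment/checkpoints,
    # and uses the aligned word onsets in place of the raw onset detections.
    onsets = segment(vocals_file, reference_syls=reference_syls)
    print(onsets)  # assumed return value; the diff does not show segment()'s return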