diff --git a/cnn_madmom/segment.py b/cnn_madmom/segment.py index a377c7276015b27288daae527e4923db722287ef..54257ebb63f611eccb3833a1a8bb5c321de38a66 100644 --- a/cnn_madmom/segment.py +++ b/cnn_madmom/segment.py @@ -5,6 +5,7 @@ import re import matplotlib.pyplot as plt from scipy.ndimage.filters import maximum_filter import scipy.signal as sg +import parselmouth def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500): @@ -139,7 +140,7 @@ if __name__ == "__main__": if(len(sys.argv) == 3): reference_syls = getSyls(sys.argv[2]) - print(reference_syls) + #print(reference_syls) backtrack = False @@ -147,6 +148,7 @@ if __name__ == "__main__": spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler') sig = madmom.audio.signal.Signal(songfile, num_channels=1) + parsel = parselmouth.Sound(sig) spec = madmom.audio.spectrogram.Spectrogram(sig) filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24) @@ -168,11 +170,22 @@ if __name__ == "__main__": onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=0.1, smooth=0) #onsets = np.array([o for o in onsets if cnn_smoothed[o] > 0.1]) + pitch = parsel.to_pitch() + pitch_values = pitch.selected_array['frequency'] + + pad_before = round(pitch.xs()[0]*100) + pad_after = len(magnitude) - len(pitch_values) - pad_before + + pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0)) + + mask_function = magnitude * pitch_values + mask_function = mask_function/np.max(mask_function) + mask_threshold = 0.15 + mask_window = [1,6] invalid_onsets_idx = [] - magnitude_window = [2,8] - magnitude_threshold = 1.2 + for i in range(len(onsets)): - if np.max(magnitude[onsets[i]+magnitude_window[0]:onsets[i]+magnitude_window[1]]) < magnitude_threshold: + if np.max(mask_function[onsets[i]+mask_window[0]:onsets[i]+mask_window[1]]) < mask_threshold: invalid_onsets_idx.append(i) onsets = np.delete(onsets, invalid_onsets_idx) @@ -207,15 +220,16 @@ if __name__ == "__main__": fig, axs = plt.subplots(nrows=2, sharex=True) axs[0].imshow(log_spec.T, origin='lower', aspect='auto') axs[0].vlines(reference_onsets, 0, 140, colors='red') - axs[1].plot(magnitude) + axs[0].plot((pitch_values/np.max(pitch_values))*140, color='yellow') + axs[1].plot(mask_function) #axs[1].plot(cnn_smoothed) #axs[1].plot(spectral_function, color='green') axs[1].plot(activation_smoothed, color='orange') axs[1].vlines(onsets, 0, 2, colors='red') - axs[1].hlines([max(magnitude_threshold, 0.5)], 0, onsets[-1]+100, colors='black') + axs[1].hlines([max(mask_threshold, 0)], 0, onsets[-1]+100, colors='black') - bins = np.arange(0, 2, 0.05) - hist, hist_axs = plt.subplots(nrows=1) - hist_axs.hist(magnitude, bins=bins) + #bins = np.arange(0, 1, 0.02) + #hist, hist_axs = plt.subplots(nrows=1) + #hist_axs.hist(mask_function, bins=bins) plt.show()