From 1d52574037098babbefd409b0237d9ba6cafee11 Mon Sep 17 00:00:00 2001
From: Sting <lallegre26@gmail.com>
Date: Sun, 26 Nov 2023 14:41:43 +0100
Subject: [PATCH] Do away with Bash scripts

---
 MANIFEST.in                   |  1 -
 README.md                     | 64 +++++++++++------------
 autokara/autokara.py          | 15 ++++--
 autokara/extractAss.sh        | 34 ------------
 autokara/extractWav.sh        | 97 -----------------------------------
 autokara/preprocess/audio.py  | 50 ++++++++++++++++++
 autokara/preprocess/lyrics.py | 18 +++++++
 autokara/preprocess_media.py  | 42 +++++++++++++++
 preprocess_media.sh           | 41 ---------------
 setup.py                      |  9 ++--
 10 files changed, 153 insertions(+), 218 deletions(-)
 delete mode 100755 autokara/extractAss.sh
 delete mode 100755 autokara/extractWav.sh
 create mode 100644 autokara/preprocess/audio.py
 create mode 100644 autokara/preprocess/lyrics.py
 create mode 100644 autokara/preprocess_media.py
 delete mode 100755 preprocess_media.sh

diff --git a/MANIFEST.in b/MANIFEST.in
index 7c6af3d..06bf2ec 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,3 @@
 include requirements.txt
 include README.md
-include preprocess_media.sh
 recursive-include autokara *
diff --git a/README.md b/README.md
index acc798d..de29e07 100644
--- a/README.md
+++ b/README.md
@@ -14,24 +14,22 @@ All other python modules can be installed directly through PIP, see next section
 
 ## Install
 
-### Linux
-
 Using a virtual environment is strongly recommended (but not mandatory if you know what you're doing) :
 ```bash
-$ python -m venv env     # create the virtual environment, do it once
-$ source env/bin/activate # use the virtual environement
+python -m venv env     # create the virtual environment, do it once
+source env/bin/activate # use the virtual environment
 
 # To exit the virtual environment
-$ deactivate              
+deactivate              
 ```
 
 The simplest way to install Autokara is through PIP.
 ```bash
 # Using HTTPS
-$ pip install git+https://git.iiens.net/bakaclub/autokara.git
+pip install git+https://git.iiens.net/bakaclub/autokara.git
 
 # Or SSH
-$ pip install git+ssh://git@git.iiens.net:bakaclub/autokara.git
+pip install git+ssh://git@git.iiens.net:bakaclub/autokara.git
 ```
 
 Or you can clone the repo and use `pip install <repo_directory>` if you prefer.
@@ -39,12 +37,9 @@ Or you can clone the repo and use `pip install <repo_directory>` if you prefer.
 
 To use the custom phonetic mappings for Japanese Romaji and other non-English languages, you need to update manually (for now) the g2p DB (within the venv):
 ```bash
-$ autokara-gen-lang
+autokara-gen-lang
 ```
 
-### Windows
-
-Still working on that...
 
 ## Configuration
 
@@ -61,26 +56,26 @@ This new file has priority over the default one, which is used only as fallback.
 
 To use Autokara, you need :
  - A media file of the song (video, or pre-extracted vocals)
- - An ASS file with the lyrics, split by syllable
+ - An ASS file with the lyrics, split by syllable (you can use the [Auto-Split](https://docs.karaokes.moe/aegisub/auto-split.lua) in Aegisub, but doing it manually may yield better results)
 
 To execute AutoKara on a MKV video file and an ASS file containing the lyrics (ASS will be overwritten):
 ```bash
-$ autokara video.mkv lyrics.ass
+autokara video.mkv lyrics.ass
 ```
 
 To output to a different file (and keep the original) :
 ```bash
-$ autokara video.mkv lyrics.ass -o output.ass
+autokara video.mkv lyrics.ass -o output.ass
 ```
 
 To execute AutoKara on a (pre-extracted) WAV (or OGG, MP3, ...) vocals file, pass the `--vocals` flag :
 ```bash
-$ autokara vocals.wav output.ass --vocals
+autokara vocals.wav output.ass --vocals
 ```
 
 To use a phonetic transcription optimized for a specific language, use `--lang` (or `-l`) :
 ```bash
-$ autokara vocals.wav output.ass --lang jp
+autokara vocals.wav output.ass --lang jp
 ```
 
 Available languages options are :
@@ -94,36 +89,35 @@ da : Danish
 
 Full help for all options is available with :
 ```bash
-$ autokara -h
+autokara -h
 ```
 
 ## Useful scripts
 
-To only extract .wav audio from a MKV file :
-```bash
-$ ./extractWav.sh source_video output_audio
-```
+### Manual preprocessing
 
-To only extract .ass sub file from a MKV file :
-```bash
-$ ./extractAss.sh source_video output_subs
-```
+Use `autokara-preprocess` if you want to manually preprocess video/lyrics in advance :
 
-To only separate vocals from instruments in an audio file :
 ```bash
-demucs --two-stems=vocals -o output_folder audio_file.wav
-```
+# Extract vocals from video :
+autokara-preprocess --vocals video_file output_folder/ 
 
-Batch preprocessing (vocals + ASS extraction) of all videos in a directory :
-```bash
-$ ./preprocess_media.sh video_folder output_folder
+# Extract ASS file from a MKV containing a subtitle track :
+autokara-preprocess --lyrics video_file output_file.ass
+
+# Do both at once :
+autokara-preprocess --full video_file output_folder/
 ```
 
-A visualization tool, mainly intended for debug.
-Does the same as autokara.py, but instead of writing to a file, plots a graphic with onset times, spectrogram, probability curves,... 
-Does not work on video files, only separated vocals audio files
+Then you can use Autokara on the extracted files with the `--vocals` flag.
+
+### Sound and onsets plotting
+
+A visualization tool, mainly intended for debugging or for the curious.
+It does the same as `autokara`, but instead of writing to a file, it plots a graph with syllable onset times, spectrogram, probability curves, etc.
+It does not work on video files, only on separated vocals audio files :
 ```bash
-$ autokara-plot vocals.wav lyrics.ass
+autokara-plot vocals.wav lyrics.ass
 ```
 
 
diff --git a/autokara/autokara.py b/autokara/autokara.py
index d981211..21f8d21 100644
--- a/autokara/autokara.py
+++ b/autokara/autokara.py
@@ -3,11 +3,14 @@ import argparse
 import demucs.separate
 import subprocess
 import shlex
+import shutil
 from pathlib import Path
 from configparser import ConfigParser
 
 from .autosyl.assUtils import AssWriter, getSyls, getHeader
 from .autosyl.segment import segment
+from .preprocess.audio import *
+from .preprocess.lyrics import *
 
 
 
@@ -25,6 +28,7 @@ def main(opts=None):
     args = parser.parse_args(opts)
 
     ass_file = args.ass_file
+    source_file = args.source_file
     verbose = args.verbose
 
     here = Path(__file__).parent
@@ -54,16 +58,14 @@ def main(opts=None):
         basename = Path(args.source_file).stem
         audio_file = f"{media_dir:s}/audio/{basename:s}.wav"
 
-        subprocess.call(shlex.split(f'{str(here)}/extractWav.sh "{args.source_file:s}" "{audio_file}"'))
+        extract_audio(source_file, output_file=audio_file)
 
         Path(f"{media_dir:s}/vocals").mkdir(parents=True, exist_ok=True)
         output_folder = f"{media_dir:s}/vocals"
 
         print("Isolating vocals...")
 
-        # Not working, don't know why
-        # demucs.separate.main(shlex.split('--two-stems vocals -o "%s" "%s"' % (output_folder, audio_file)))
-        subprocess.call(shlex.split(f'demucs --two-stems vocals -o "{output_folder:s}" "{audio_file:s}"'))
+        extract_vocals(audio_file, output_folder)
 
         vocals_file = f"{media_dir:s}/vocals/htdemucs/{basename:s}/vocals.wav"
     else:
@@ -92,6 +94,11 @@ def main(opts=None):
     writer.writeSyls(syls, line_meta)
     writer.closeAss()
 
+    # clean up
+    if not args.vocals:
+        shutil.rmtree(f'{media_dir:s}/vocals/htdemucs/{basename:s}')
+        Path(audio_file).unlink(missing_ok=True)
+
 
 if __name__ == "__main__":
     main()
diff --git a/autokara/extractAss.sh b/autokara/extractAss.sh
deleted file mode 100755
index cf5dcdd..0000000
--- a/autokara/extractAss.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-##########################################################################################################
-#
-# COMMAND : extractAss.sh
-#
-# AUTHOR : Kubat
-#
-# DESCRIPTION : CLI tool to extract subtitles from .mkv files
-#
-# USE : ./extractAss.sh fileInput.mkv fileOutput.ass
-#
-# REQUIREMENTS : Have FFMPEG and SoX installed (for audio/video decoding)
-#
-#
-##########################################################################################################
-
-USAGE_MESSAGE="usage : $0 fileInput.mkv fileOutput.ass"
-if [ $# != 2 ]
-then
-  echo $USAGE_MESSAGE
-  exit 1
-fi
-
-if ! [[ "$1" =~ .mkv$ ]] || ! [[ "$2" =~ .ass$ ]]
-then
-  echo $USAGE_MESSAGE
-  exit 1
-fi
-
-# get the subtitles track id
-ID=$(mkvmerge --identify "$1" | sed -n 's/Track ID \([[:digit:]]*\).*subtitles.*/\1/p')
-
-mkvextract tracks "$1" "$ID":"$2"
diff --git a/autokara/extractWav.sh b/autokara/extractWav.sh
deleted file mode 100755
index cee7e4c..0000000
--- a/autokara/extractWav.sh
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/bin/bash
-
-
-##########################################################################################################
-#
-# COMMAND : extractWav.sh
-#
-# AUTHOR : Sting
-#
-# DESCRIPTION : CLI tool to extract audio from .mkv files and convert it to 1-channel WAV files
-#               Currently supported formats :
-#			- Video : .mkv files only, any codec supported by FFMPEG
-#			- Audio : AAC, FLAC, DTS, AC3, MP3 (MPEG), OPUS, VORBIS
-#
-# USE : ./extractWav.sh source_folder destination_folder
-#
-# REQUIREMENTS : Have FFMPEG and SoX installed (for audio/video decoding)
-#
-#
-##########################################################################################################
-
-
-
-USAGE_MESSAGE="usage : $0 source_file dest_file"
-if [ $# != 2 ]; then
-        echo $USAGE_MESSAGE; exit 1;
-fi
-
-
-filename=$1
-dest_file=$2
-
-echo $filename
-echo $dest_file
-
-[ -e "$filename" ] || continue
-name=${filename##*/}
-base=${name%.mkv}
-
-codecLine=$(mkvinfo "$filename" | grep " A_")
-regex=".*A_([A-Z0-9]+).*"
-
-[[ $codecLine =~ $regex ]]
-
-codec=${BASH_REMATCH[1]}
-
-case $codec in
-
-	"AAC")
-			extension="m4a"
-			;;
-
-	"FLAC")
-			extension="flac"
-			;;
-
-	"VORBIS")
-			extension="ogg"
-			;;
-
-	"MPEG")
-		extension="mp3"
-			;;
-
-	"AC3")
-		extension="ac3"
-		;;
-
-	"EAC3")
-		extension="eac3"
-		;;
-
-	"DTS")
-		extension="dts"
-		;;
-
-	"OPUS")
-		extension="opus"
-		;;
-
-	*)
-		extension=""
-		;;
-
-esac
-
-
-
-ffmpeg -i "$filename" -acodec copy -vn "$base.$extension" && \
-ffmpeg -i "$base.$extension" "$base.wav" && \
-#sox "$2/$base.stereo.wav" "$2/$base.wav" remix - && \
-#rm "$2/$base.stereo.wav" && \
-rm "$base.$extension"
-mv "$base.wav" "$2"
-
-
-
diff --git a/autokara/preprocess/audio.py b/autokara/preprocess/audio.py
new file mode 100644
index 0000000..a28e493
--- /dev/null
+++ b/autokara/preprocess/audio.py
@@ -0,0 +1,50 @@
+import sys
+import argparse
+import demucs.separate
+import subprocess
+import shlex
+from pathlib import Path
+import shutil
+
+
+
+
+
+def extract_audio(source_file, output_file=None):
+    """Extract the audio of `source_file` with ffmpeg, dropping video (`-vn`).
+
+    Writes to `output_file`, or to the source path with a `.wav` suffix when it
+    is not given, and returns that path as a string.
+    """
+    out_path = str(output_file or Path(source_file).with_suffix(".wav"))
+    subprocess.call(["ffmpeg", "-i", str(source_file), "-vn", out_path])
+    return out_path
+
+
+
+def extract_vocals(source_file, output_folder):
+    """Run demucs two-stem (vocals) separation on `source_file` into `output_folder`."""
+    # argv list instead of shlex.split: paths containing quotes or backslashes stay intact
+    subprocess.call(["demucs", "--two-stems", "vocals", "-o", str(output_folder), str(source_file)])
+
+
+
+def preprocess_video(source_file, output_folder=None, media_dir="."):
+    """Write the isolated vocals of `source_file` to `<output_folder>/vocals.ogg`.
+
+    `output_folder` defaults to `<media_dir>/vocals`; temporary files are removed."""
+    basename = Path(source_file).stem
+    audio_file = f"{media_dir}/audio/{basename}.wav"
+    output_folder = str(output_folder or f"{media_dir}/vocals")
+    stems_dir = f"{output_folder}/htdemucs/{basename}"
+    Path(audio_file).parent.mkdir(parents=True, exist_ok=True)
+    Path(output_folder).mkdir(parents=True, exist_ok=True)
+    try:
+        print("Extracting audio from video file...")
+        extract_audio(source_file, audio_file)
+        print("Isolating vocals...")
+        extract_vocals(audio_file, output_folder)
+        subprocess.call(["ffmpeg", "-i", f"{stems_dir}/vocals.wav", f"{output_folder}/vocals.ogg"])
+    finally:  # clean up even if a step raised, and even if demucs produced nothing
+        shutil.rmtree(stems_dir, ignore_errors=True)
+        Path(audio_file).unlink(missing_ok=True)
diff --git a/autokara/preprocess/lyrics.py b/autokara/preprocess/lyrics.py
new file mode 100644
index 0000000..5e286aa
--- /dev/null
+++ b/autokara/preprocess/lyrics.py
@@ -0,0 +1,18 @@
+import sys
+import argparse
+import subprocess
+import shlex
+from pathlib import Path
+
+
+
+def extract_subtitles(source_file, output_file=None):
+    """Extract the first subtitle track of an MKV to `output_file` (default: source path with `.ass`)."""
+    out_path = str(output_file or Path(source_file).with_suffix(".ass"))
+    # Parse `mkvmerge --identify` in Python: no shell, no sed, no quoting issues with file names
+    listing = subprocess.run(["mkvmerge", "--identify", str(source_file)], capture_output=True, text=True).stdout
+    ids = [line[len("Track ID "):].split(":")[0] for line in listing.splitlines() if line.startswith("Track ID ") and "subtitles" in line]
+    if not ids:
+        print(f"No subtitle track found in {source_file}")
+        return
+    subprocess.call(["mkvextract", str(source_file), "tracks", f"{ids[0]}:{out_path}"])
\ No newline at end of file
diff --git a/autokara/preprocess_media.py b/autokara/preprocess_media.py
new file mode 100644
index 0000000..196c0b0
--- /dev/null
+++ b/autokara/preprocess_media.py
@@ -0,0 +1,42 @@
+from .preprocess.audio import *
+from .preprocess.lyrics import *
+import sys
+import argparse
+from configparser import ConfigParser
+
+
+def main(opts=None):
+    """Command-line entry point: extract vocals and/or lyrics from a media file.
+
+    `opts` is the argument list handed to argparse; None means sys.argv.
+    """
+    cli = argparse.ArgumentParser(description='Script to prepare media for Autokara - extract vocals and lyrics from video')
+    cli.add_argument("--vocals", action="store_true", help="Perform vocals extraction on source file")
+    cli.add_argument("--lyrics", action="store_true", help="Perform ASS extraction on source file, if it has a subtitle track")
+    cli.add_argument("--full", action="store_true", help="Extract both vocals and lyrics")
+    cli.add_argument("source_file", type=str, help="The video/audio file to preprocess")
+    cli.add_argument("output_file", type=str, help="If extracting lyrics, the ASS output file. If extracting vocals or both, the output folder for separated tracks")
+    args = cli.parse_args(opts)
+
+    # Packaged defaults are read first, then the per-user file.
+    package_dir = Path(__file__).parent
+    user_conf = Path().home() / ".config" / "autokara" / "autokara.conf"
+    config = ConfigParser()
+    config.read([str(package_dir / "default.conf"), str(user_conf)])
+    media_dir = config['Media']['media_dir']
+
+    # No mode flag at all, or --vocals together with --lyrics, behaves like --full.
+    run_all = args.full or args.vocals == args.lyrics
+    if run_all or args.vocals:
+        # In these modes the positional output argument is a folder.
+        preprocess_video(args.source_file, output_folder=args.output_file, media_dir=media_dir)
+    if run_all:
+        extract_subtitles(args.source_file, output_file=f'{args.output_file:s}/vocals.ass')
+    elif args.lyrics:
+        # Lyrics only: the positional output argument is the ASS file itself.
+        extract_subtitles(args.source_file, output_file=args.output_file)
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/preprocess_media.sh b/preprocess_media.sh
deleted file mode 100755
index cc0f1cd..0000000
--- a/preprocess_media.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-
-##########################################################################################################
-#
-# COMMAND : preprocess_media.sh
-#
-# AUTHOR : Sting
-#
-# DESCRIPTION : CLI tool to batch extract ASS lyrics and vocals from a video folder
-#
-# USE : ./preprocess_media.sh input_folder output_folder
-#
-# REQUIREMENTS : FFMPEG, Demucs, extractAss and extractWav
-#
-#
-##########################################################################################################
-
-
-USAGE_MESSAGE="usage : $0 video_folder train_folder"
-if [ $# != 2 ]; then
-        echo $USAGE_MESSAGE; exit 1;
-fi
-
-
-video_folder=$1
-train_folder=$2
-
-for filename in "$video_folder"/*.mkv; do
-    name=${filename##*/}
-    base=${name%.mkv}
-    mkdir -p "$train_folder/$base"
-
-    extractWav.sh "$filename" "$train_folder/$base/$base.wav"
-    demucs --two-stems vocals -o "$train_folder/$base" "$train_folder/$base/$base.wav"
-    rm "$train_folder/$base/$base.wav"
-    ffmpeg -i "$train_folder/$base/htdemucs/$base/vocals.wav" "$train_folder/$base/vocals.ogg"
-    rm -r "$train_folder/$base/htdemucs"
-
-    extractAss.sh "$filename" "$train_folder/$base/vocals.ass"
-done;
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c71a27b..a72de99 100644
--- a/setup.py
+++ b/setup.py
@@ -50,14 +50,11 @@ setup(
     entry_points={
         'console_scripts': ['autokara=autokara.autokara:main',
                             'autokara-plot=autokara.plot_syls:main',
-                            'autokara-gen-lang=autokara.update_lang_db:main'
+                            'autokara-gen-lang=autokara.update_lang_db:main',
+                            'autokara-preprocess=autokara.preprocess_media:main'
                             ],
     },
-    scripts=[
-        'autokara/extractAss.sh',
-        'autokara/extractWav.sh',
-        'preprocess_media.sh'    
-    ],
+    scripts=[],
     license='MIT License',
     classifiers=[
         # Trove classifiers
-- 
GitLab