I managed to make a Python script to do this, here's how it works:
- It uses FFmpeg to extract a 5-second clip of the audio from each file (starting 10 seconds in) and tries to make the two clips more similar (mono, same sample rate, a lowpass filter and the dynaudnorm filter).
- It uses SoundFile to load the audio clips, then SciPy's cross-correlation function `scipy.signal.correlate` and NumPy's `argmax` to get the time delay between the two audio clips.
- It uses FFmpeg again to mux the audio and video together, and uses `-itsoffset` to offset the audio.
To use the script, put the files with the best-quality video in a folder named "video" and the files with the best-quality audio in a folder named "audio". Each pair of files must have the same filename (the file extensions can be different, though). The combined files will then be created in a folder named "output".
I'm sure this script isn't perfect and could be improved, but it seems to work alright with what I've tested it on.
Thanks u/nedragrevev for the inspiration!
import os
import subprocess
import numpy as np
import soundfile as sf
from scipy.signal import correlate
# Index the "audio" folder by filename without extension, so a video can be
# matched to its audio counterpart even when the extensions differ.  If two
# audio files share a stem, the one listed last wins.
audio_no_exts = {
    os.path.splitext(audio_name)[0]: audio_name
    for audio_name in os.listdir('audio')
}

# One entry per video that has an audio file with the same stem.  Each entry
# carries the two input paths, the output path (always ".mp4") and the stem.
pairs = []
for video_name in os.listdir('video'):
    stem, _ext = os.path.splitext(video_name)
    audio_name = audio_no_exts.get(stem)
    if audio_name is None:
        # No audio file with this stem: nothing to combine.
        continue
    pairs.append({
        'audio': os.path.join('audio', audio_name),
        'video': os.path.join('video', video_name),
        'output': os.path.join('output', stem + '.mp4'),
        'no_ext': stem,
    })
# 'temp' holds the short WAV excerpts used for alignment; 'output' receives
# the muxed files.
os.makedirs('temp', exist_ok=True)
os.makedirs('output', exist_ok=True)


def cmd(inp, out):
    """Build the FFmpeg command that writes a comparable audio excerpt.

    The excerpt is 5 seconds long, starts 10 seconds into ``inp``, and is
    converted to mono at 44100 Hz, low-passed at 3 kHz and run through
    ``dynaudnorm`` so that both sources look as alike as possible before
    they are cross-correlated.  ``-y`` overwrites ``out`` if it exists.
    """
    return [
        'ffmpeg', '-y', '-i', inp,
        '-ss', '10', '-t', '5',
        '-ac', '1', '-ar', '44100',
        '-af', 'lowpass=f=3000,dynaudnorm',
        out,
    ]


def _estimate_shift(reference, other, fs):
    """Return the delay, in seconds, to apply to ``other`` to match ``reference``.

    A positive value means the shared content appears later in ``reference``
    than in ``other``.  ``fs`` is the sample rate common to both signals.
    """
    corr = correlate(reference, other)
    # In 'full' mode the zero-lag term sits at index len(other) - 1.  Using
    # the midpoint of `corr` is only equivalent when both inputs have the
    # same length, which is not the case when a file ends inside the
    # sampled window.
    zero_lag = len(other) - 1
    return float(int(np.argmax(corr)) - zero_lag) / fs


for p in pairs:
    # Skip pairs that were already combined on a previous run.
    if os.path.isfile(p['output']):
        continue
    print(p['no_ext'])

    wav_video_path = os.path.join('temp', p['no_ext'] + '_video.wav')
    wav_audio_path = os.path.join('temp', p['no_ext'] + '_audio.wav')
    try:
        # check=True: if FFmpeg fails, stop here with its exit status rather
        # than with a less helpful error from reading a missing WAV file.
        subprocess.run(cmd(p['video'], wav_video_path), check=True)
        subprocess.run(cmd(p['audio'], wav_audio_path), check=True)
        # Both excerpts were resampled to the same rate, so one `fs` suffices.
        wav_video, fs = sf.read(wav_video_path)
        wav_audio, _ = sf.read(wav_audio_path)
    finally:
        # Always remove the excerpts, even when extraction or reading fails,
        # so the final cleanup of the 'temp' folder can succeed.
        for tmp_path in (wav_video_path, wav_audio_path):
            if os.path.isfile(tmp_path):
                os.remove(tmp_path)

    if len(wav_video) == 0 or len(wav_audio) == 0:
        # An input shorter than the 10-second start offset yields an empty
        # excerpt, and there is nothing to correlate.
        print('skipped: no audio found in the sampled window')
        continue

    shift = _estimate_shift(wav_video, wav_audio, fs)
    print(shift)

    # Take the video stream from the first input and the audio stream from
    # the second, delaying the second input by `shift`; streams are copied
    # without re-encoding.  The offset is written in fixed-point notation
    # because str() switches to scientific notation for very small values,
    # which is not part of FFmpeg's documented time-duration syntax.
    c = [
        'ffmpeg', '-y',
        '-i', p['video'],
        '-itsoffset', '{:.6f}'.format(shift),
        '-i', p['audio'],
        '-c', 'copy',
        '-map', '0:v:0', '-map', '1:a:0',
        p['output'],
    ]
    result = subprocess.run(c)
    if result.returncode != 0:
        # Keep going with the remaining pairs, but do not leave a partial
        # file behind: the "already combined" check above would otherwise
        # skip this pair on every later run.
        print('ffmpeg failed with exit code {} for {}'.format(
            result.returncode, p['no_ext']))
        if os.path.isfile(p['output']):
            os.remove(p['output'])
# Best-effort removal of the scratch folder.  os.rmdir raises OSError when the
# folder is missing or not empty; in either case it is simply left alone.
# Only OSError is caught so that KeyboardInterrupt and SystemExit still
# propagate.
try:
    os.rmdir('temp')
except OSError:
    pass