A demo of the result of this code can be seen in this YouTube video.
#------------- video description ------------------------------
SPEECH: "What happens if you take speech audio clip, Get a timed transcript for words, Then play a musical note that closest match each word." - by https://elevenlabs.io/ 1. Recorded using Audacity, exported to "video.wav" at 16000 Hz. 2. Put in same folder as generate.py Python file found on https://github.com/johnafish/whisperer Had a problem running python generate.py but fixed it with this answer: https://github.com/openai/whisper/discussions/120 3. It'll generate caption.srt file in the same folder 4. Then you can get code which I prompted chatGPT to help me with to turn the "caption.srt" along with original "video.wav" to another .wav file which you can then use Audacity to play along/export together like shown in video. (code shown below) Hope you like this. Let me know in comments what you do with it love to see more related to this.
# -------------------------------------------------------------------------------------------------------------------
#---- code starts --------------
# This code was mostly written by ChatGPT. I just changed configuration
# details, such as the note frequencies and the folder path that points
# to the mp3 note sounds I use.
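# The script relies on pysrt, pydub, librosa and numpy (and pydub needs
# ffmpeg available to load mp3 files); on my setup something like this
# installed them:
# pip install pysrt pydub librosa numpy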
import pysrt
from pydub import AudioSegment
from pydub.generators import Sine
import librosa
import numpy as np
# Load the SRT file
subs = pysrt.open('caption.srt')
# Load the WAV file
audio = AudioSegment.from_wav('video.wav')
# Note frequencies in Hz (equal temperament, A4 = 440)
note_frequencies = {
    'C3': 130.81,
    'D3': 146.83,
    'E3': 164.81,
    'F3': 174.61,
    'G3': 196.00,
    'A3': 220.00,
    'B3': 246.94,
    'C4': 261.63,
    'D4': 293.66,
    'E4': 329.63,
    'F4': 349.23,
    'G4': 392.00,
    'A4': 440.00,
    'B4': 493.88,
    'C5': 523.25,
    'D5': 587.33,
    'E5': 659.25,
    'F5': 698.46,
    'G5': 783.99,
    'A5': 880.00,
    'B5': 987.77,
}
# Load one mp3 per note (files named C3.mp3, D3.mp3, ... in the songs folder)
flute_sounds = {note: AudioSegment.from_mp3(f'../songs/{note}.mp3') for note in note_frequencies}
# Find the note whose frequency is closest to the given pitch,
# e.g. a measured pitch of 435 Hz maps to 'A4' (440.00)
def find_closest_note(pitch):
    closest_note = min(note_frequencies.keys(), key=lambda note: abs(note_frequencies[note] - pitch))
    return closest_note
# Estimate the dominant pitch (in Hz) of an audio segment with librosa's piptrack
def extract_pitch(audio_segment, sr=16000, n_fft=2048, hop_length=512, fmin=75, fmax=1500):
    samples = np.array(audio_segment.get_array_of_samples())
    float_samples = librosa.util.buf_to_float(samples, n_bytes=2, dtype=np.float32)
    pitches, magnitudes = librosa.piptrack(y=float_samples, sr=sr, n_fft=n_fft, hop_length=hop_length, fmin=fmin, fmax=fmax)
    # Keep only candidates with above-median magnitude, then average them
    pitches = pitches[magnitudes > np.median(magnitudes)]
    if len(pitches) == 0:
        return 0
    return np.mean(pitches)
# Earlier variant using librosa's default piptrack settings (kept for
# reference, not used below)
def extract_pitch2(audio_segment, sr=16000):
    # Convert the pydub audio segment to a float NumPy array
    samples = np.array(audio_segment.get_array_of_samples())
    float_samples = librosa.util.buf_to_float(samples, n_bytes=2, dtype=np.float32)
    # Extract pitch
    pitches, magnitudes = librosa.piptrack(y=float_samples, sr=sr)
    pitches = pitches[magnitudes > np.median(magnitudes)]
    if len(pitches) == 0:
        return 0
    return np.mean(pitches)
# Create a new audio segment for the output
output_audio = AudioSegment.silent(duration=len(audio))
for i, sub in enumerate(subs):
    # Subtitle start/end times in milliseconds
    start_time = (sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds) * 1000 + sub.start.milliseconds
    end_time = (sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds) * 1000 + sub.end.milliseconds
    # Ensure the note does not overlap with the next word
    if i < len(subs) - 1:
        next_start_time = (subs[i + 1].start.hours * 3600 + subs[i + 1].start.minutes * 60 + subs[i + 1].start.seconds) * 1000 + subs[i + 1].start.milliseconds
        max_duration = min(end_time, next_start_time) - start_time
    else:
        max_duration = end_time - start_time
    word_audio = audio[start_time:end_time]
    pitch = extract_pitch(word_audio)
    if pitch > 0:
        closest_note = find_closest_note(pitch)
        note_audio = flute_sounds[closest_note]
        # Trim the note so it ends before the next word starts
        if len(note_audio) > max_duration:
            note_audio = note_audio[:max_duration]
        output_audio = output_audio.overlay(note_audio, position=start_time)
# Export the output audio
output_audio.export("output_with_flute.wav", format="wav")
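# Optional: instead of lining the two tracks up in Audacity, they can also be
# mixed right here with pydub. A minimal sketch (the output file name
# "speech_plus_flute.wav" is my own choice, not part of the original workflow):
combined = audio.overlay(output_audio)
combined.export("speech_plus_flute.wav", format="wav")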
#----- code ends -------------