Sunday, March 10, 2024

Sharing code I used to match musical notes to speech

A demo of the result of this code can be seen in this YouTube video.

#------------- video description ------------------------------

SPEECH: "What happens if you take speech audio clip, Get a timed transcript for words, Then play a musical note that closest match each word." - by https://elevenlabs.io/ 1. Recorded using Audacity, exported to "video.wav" at 16000 Hz. 2. Put in same folder as generate.py Python file found on https://github.com/johnafish/whisperer Had a problem running python generate.py but fixed it with this answer: https://github.com/openai/whisper/discussions/120 3. It'll generate caption.srt file in the same folder 4. Then you can get code which I prompted chatGPT to help me with to turn the "caption.srt" along with original "video.wav" to another .wav file which you can then use Audacity to play along/export together like shown in video. (code shown below) Hope you like this. Let me know in comments what you do with it love to see more related to this.



# -------------------------------------------------------------------------------------------------------------------

#---- code starts -------------- This code was mostly written by ChatGPT.
# I just changed configuration details such as the note frequencies and the
# folder path pointing to the mp3 note sounds I use.

import pysrt
from pydub import AudioSegment
from pydub.generators import Sine
import librosa
import numpy as np

# Load the SRT file
subs = pysrt.open('caption.srt')

# Load the WAV file (recorded at 16000 Hz, matching extract_pitch's default sr)
audio = AudioSegment.from_wav('video.wav')

note_frequencies = {
    'C3': 130.81, 'D3': 146.83, 'E3': 164.81, 'F3': 174.61,
    'G3': 196.00, 'A3': 220.00, 'B3': 246.94,
    'C4': 261.63, 'D4': 293.66, 'E4': 329.63, 'F4': 349.23,
    'G4': 392.00, 'A4': 440.00, 'B4': 493.88,
    'C5': 523.25, 'D5': 587.33, 'E5': 659.25, 'F5': 698.46,
    'G5': 783.99, 'A5': 880.00, 'B5': 987.77,
}

flute_sounds = {note: AudioSegment.from_mp3(f'../songs/{note}.mp3') for note in note_frequencies}
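# If a note mp3 were missing, a plain sine tone could be synthesized as a fallback.
# Untested sketch using pydub's Sine generator (imported above but otherwise unused):
# flute_sounds = {note: Sine(freq).to_audio_segment(duration=1000).apply_gain(-3)
#                 for note, freq in note_frequencies.items()}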

# Map a pitch (Hz) to the name of the closest note in the table above
def find_closest_note(pitch):
    closest_note = min(note_frequencies.keys(), key=lambda note: abs(note_frequencies[note] - pitch))
    return closest_note
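# For example, find_closest_note(442.0) returns 'A4', since 440.00 is the
# nearest frequency in the table.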

# Extract a representative pitch (Hz) from a PyDub segment using librosa's piptrack
def extract_pitch(audio_segment, sr=16000, n_fft=2048, hop_length=512, fmin=75, fmax=1500):
    # Convert the PyDub segment to a float32 NumPy array (assumes 16-bit mono audio)
    samples = np.array(audio_segment.get_array_of_samples())
    float_samples = librosa.util.buf_to_float(samples, n_bytes=2, dtype=np.float32)

    pitches, magnitudes = librosa.piptrack(y=float_samples, sr=sr, n_fft=n_fft, hop_length=hop_length, fmin=fmin, fmax=fmax)
    # Keep only the stronger pitch candidates and average them;
    # 0 means "no pitch found", and that word gets no note
    pitches = pitches[magnitudes > np.median(magnitudes)]
    if len(pitches) == 0:
        return 0
    return np.mean(pitches)

# Variant of the above that uses librosa's default pitch range (kept for reference; unused)
def extract_pitch2(audio_segment, sr=16000):
    # Convert PyDub audio segment to NumPy array
    samples = np.array(audio_segment.get_array_of_samples())
    float_samples = librosa.util.buf_to_float(samples, n_bytes=2, dtype=np.float32)

    # Extract pitch
    pitches, magnitudes = librosa.piptrack(y=float_samples, sr=sr)
    pitches = pitches[magnitudes > np.median(magnitudes)]
    if len(pitches) == 0:
        return 0
    return np.mean(pitches)


# Create a new (silent) audio segment for the output, same length as the input
output_audio = AudioSegment.silent(duration=len(audio))

for i, sub in enumerate(subs):
    # Convert the pysrt start/end times to milliseconds
    start_time = (sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds) * 1000 + sub.start.milliseconds
    end_time = (sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds) * 1000 + sub.end.milliseconds

    # Ensure the note does not overlap with the next word
    if i < len(subs) - 1:
        next_start_time = (subs[i + 1].start.hours * 3600 + subs[i + 1].start.minutes * 60 + subs[i + 1].start.seconds) * 1000 + subs[i + 1].start.milliseconds
        max_duration = min(end_time, next_start_time) - start_time
    else:
        max_duration = end_time - start_time

    word_audio = audio[start_time:end_time]
    pitch = extract_pitch(word_audio)
    if pitch > 0:
        closest_note = find_closest_note(pitch)
        note_audio = flute_sounds[closest_note]

        # Trim the note so it does not run into the next word
        if len(note_audio) > max_duration:
            note_audio = note_audio[:max_duration]

        # overlay() mixes the note on top of the silence at the word's start time
        output_audio = output_audio.overlay(note_audio, position=start_time)


# Export the output audio
output_audio.export("output_with_flute.wav", format="wav")
#----- code ends -------------
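To run it, put the script (I'll call it notes.py here, but any filename works) in the same folder as caption.srt and video.wav, with the note mp3s in ../songs/, then:

pip install pysrt pydub librosa numpy
python notes.py

Note that pydub needs ffmpeg installed on your system to read the mp3 files.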
