Sunday, March 10, 2024

Sharing code I used to match musical notes to speech

A demo of the result of this code can be seen in this YouTube video.

#------------- video description ------------------------------

SPEECH: "What happens if you take speech audio clip, Get a timed transcript for words, Then play a musical note that closest match each word." - by https://elevenlabs.io/ 1. Recorded using Audacity, exported to "video.wav" at 16000 Hz. 2. Put in same folder as generate.py Python file found on https://github.com/johnafish/whisperer Had a problem running python generate.py but fixed it with this answer: https://github.com/openai/whisper/discussions/120 3. It'll generate caption.srt file in the same folder 4. Then you can get code which I prompted chatGPT to help me with to turn the "caption.srt" along with original "video.wav" to another .wav file which you can then use Audacity to play along/export together like shown in video. (code shown below) Hope you like this. Let me know in comments what you do with it love to see more related to this.



# -------------------------------------------------------------------------------------------------------------------

#---- code starts -------------- This code was mostly written by ChatGPT.
# I just changed configuration details such as the note frequencies and the
# folder path pointing to the mp3 note sounds I use.

import pysrt
from pydub import AudioSegment
from pydub.generators import Sine
import librosa
import numpy as np

# Load the SRT file
subs = pysrt.open('caption.srt')

# Load the WAV file (recorded at 16000 Hz, matching extract_pitch's default sr)
audio = AudioSegment.from_wav('video.wav')

note_frequencies = {
    'C3': 130.81, 'D3': 146.83, 'E3': 164.81, 'F3': 174.61,
    'G3': 196.00, 'A3': 220.00, 'B3': 246.94,
    'C4': 261.63, 'D4': 293.66, 'E4': 329.63, 'F4': 349.23,
    'G4': 392.00, 'A4': 440.00, 'B4': 493.88,
    'C5': 523.25, 'D5': 587.33, 'E5': 659.25, 'F5': 698.46,
    'G5': 783.99, 'A5': 880.00, 'B5': 987.77,
}

flute_sounds = {note: AudioSegment.from_mp3(f'../songs/{note}.mp3') for note in note_frequencies}
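# If a note mp3 were missing, a plain sine tone could be synthesized as a fallback.
# Untested sketch using pydub's Sine generator (imported above but otherwise unused):
# flute_sounds = {note: Sine(freq).to_audio_segment(duration=1000).apply_gain(-3)
#                 for note, freq in note_frequencies.items()}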

# Map a pitch (Hz) to the name of the closest note in the table above
def find_closest_note(pitch):
    closest_note = min(note_frequencies.keys(), key=lambda note: abs(note_frequencies[note] - pitch))
    return closest_note
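# For example, find_closest_note(442.0) returns 'A4', since 440.00 is the
# nearest frequency in the table.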

# Extract a representative pitch (Hz) from a PyDub segment using librosa's piptrack
def extract_pitch(audio_segment, sr=16000, n_fft=2048, hop_length=512, fmin=75, fmax=1500):
    # Convert the PyDub segment to a float32 NumPy array (assumes 16-bit mono audio)
    samples = np.array(audio_segment.get_array_of_samples())
    float_samples = librosa.util.buf_to_float(samples, n_bytes=2, dtype=np.float32)

    pitches, magnitudes = librosa.piptrack(y=float_samples, sr=sr, n_fft=n_fft, hop_length=hop_length, fmin=fmin, fmax=fmax)
    # Keep only the stronger pitch candidates and average them;
    # 0 means "no pitch found", and that word gets no note
    pitches = pitches[magnitudes > np.median(magnitudes)]
    if len(pitches) == 0:
        return 0
    return np.mean(pitches)

# Variant of the above that uses librosa's default pitch range (kept for reference; unused)
def extract_pitch2(audio_segment, sr=16000):
    # Convert PyDub audio segment to NumPy array
    samples = np.array(audio_segment.get_array_of_samples())
    float_samples = librosa.util.buf_to_float(samples, n_bytes=2, dtype=np.float32)

    # Extract pitch
    pitches, magnitudes = librosa.piptrack(y=float_samples, sr=sr)
    pitches = pitches[magnitudes > np.median(magnitudes)]
    if len(pitches) == 0:
        return 0
    return np.mean(pitches)


# Create a new (silent) audio segment for the output, same length as the input
output_audio = AudioSegment.silent(duration=len(audio))

for i, sub in enumerate(subs):
    # Convert the pysrt start/end times to milliseconds
    start_time = (sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds) * 1000 + sub.start.milliseconds
    end_time = (sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds) * 1000 + sub.end.milliseconds

    # Ensure the note does not overlap with the next word
    if i < len(subs) - 1:
        next_start_time = (subs[i + 1].start.hours * 3600 + subs[i + 1].start.minutes * 60 + subs[i + 1].start.seconds) * 1000 + subs[i + 1].start.milliseconds
        max_duration = min(end_time, next_start_time) - start_time
    else:
        max_duration = end_time - start_time

    word_audio = audio[start_time:end_time]
    pitch = extract_pitch(word_audio)
    if pitch > 0:
        closest_note = find_closest_note(pitch)
        note_audio = flute_sounds[closest_note]

        # Trim the note so it does not run into the next word
        if len(note_audio) > max_duration:
            note_audio = note_audio[:max_duration]

        # overlay() mixes the note on top of the silence at the word's start time
        output_audio = output_audio.overlay(note_audio, position=start_time)


# Export the output audio
output_audio.export("output_with_flute.wav", format="wav")
#----- code ends -------------
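To run it, put the script (I'll call it notes.py here, but any filename works) in the same folder as caption.srt and video.wav, with the note mp3s in ../songs/, then:

pip install pysrt pydub librosa numpy
python notes.py

Note that pydub needs ffmpeg installed on your system to read the mp3 files.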
