"""Transcribe LiveKit microphone tracks with whisper.cpp loaded through ctypes.

The script connects to a LiveKit room, subscribes to microphone audio tracks
and runs whisper inference on a sliding window of the received audio, logging
each transcribed segment.
"""

import asyncio
import ctypes
import logging
import pathlib
import platform
from signal import SIGINT, SIGTERM

import numpy as np
from livekit import rtc

# NOTE: this name shadows the conventional ``os`` module name; it is kept
# unchanged so that anything relying on the module-level name still works.
os = platform.system().lower()
if os == "windows":
    lib_file = "whisper.dll"
elif os == "darwin":
    lib_file = "libwhisper.dylib"
else:
    lib_file = "libwhisper.so"

whisper_dir = pathlib.Path(__file__).parent.absolute() / "whisper.cpp"
libname = str(whisper_dir / lib_file)
fname_model = str(whisper_dir / "models/ggml-tiny.en.bin")

URL = "ws://localhost:7880"
TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE5MDY2MTMyODgsImlzcyI6IkFQSVRzRWZpZFpqclFvWSIsIm5hbWUiOiJuYXRpdmUiLCJuYmYiOjE2NzI2MTMyODgsInN1YiI6Im5hdGl2ZSIsInZpZGVvIjp7InJvb20iOiJ0ZXN0Iiwicm9vbUFkbWluIjp0cnVlLCJyb29tQ3JlYXRlIjp0cnVlLCJyb29tSm9pbiI6dHJ1ZSwicm9vbUxpc3QiOnRydWV9fQ.uSNIangMRu8jZD5mnRYoCHjcsQWCrJXgHCs0aNIgBFY"  # noqa


# declare the Whisper C API  (Only what we need, keep things simple)
# also see this issue: https://github.com/ggerganov/whisper.cpp/issues/9
# structure must match https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h


class WhisperSamplingStrategy(ctypes.c_int):
    """Integer constants selecting the whisper sampling strategy."""

    WHISPER_SAMPLING_GREEDY = 0
    WHISPER_SAMPLING_BEAM_SEARCH = 1


class WhisperFullParams(ctypes.Structure):
    """ctypes mirror of the parameter struct passed by value to whisper_full.

    The field order and types must match the struct declared in the whisper.h
    of the library that is actually loaded; a mismatch corrupts the call.
    """

    _fields_ = [
        ("strategy", ctypes.c_int),
        ("n_threads", ctypes.c_int),
        ("n_max_text_ctx", ctypes.c_int),
        ("offset_ms", ctypes.c_int),
        ("duration_ms", ctypes.c_int),
        ("translate", ctypes.c_bool),
        ("no_context", ctypes.c_bool),
        ("single_segment", ctypes.c_bool),
        ("print_special", ctypes.c_bool),
        ("print_progress", ctypes.c_bool),
        ("print_realtime", ctypes.c_bool),
        ("print_timestamps", ctypes.c_bool),
        ("token_timestamps", ctypes.c_bool),
        ("thold_pt", ctypes.c_float),
        ("thold_ptsum", ctypes.c_float),
        ("max_len", ctypes.c_int),
        ("split_on_word", ctypes.c_bool),
        ("max_tokens", ctypes.c_int),
        ("speed_up", ctypes.c_bool),
        ("audio_ctx", ctypes.c_int),
        ("tdrz_enable", ctypes.c_bool),
        ("initial_prompt", ctypes.c_char_p),
        ("prompt_tokens", ctypes.c_void_p),
        ("prompt_n_tokens", ctypes.c_int),
        ("language", ctypes.c_char_p),
        ("detect_language", ctypes.c_bool),
        ("suppress_blank", ctypes.c_bool),
        ("suppress_non_speech_tokens", ctypes.c_bool),
        ("temperature", ctypes.c_float),
        ("max_initial_ts", ctypes.c_float),
        ("length_penalty", ctypes.c_float),
        ("temperature_inc", ctypes.c_float),
        ("entropy_thold", ctypes.c_float),
        ("logprob_thold", ctypes.c_float),
        ("no_speech_thold", ctypes.c_float),
        ("greedy", ctypes.c_int),
        ("beam_size", ctypes.c_int),
        ("patience", ctypes.c_float),
        ("new_segment_callback", ctypes.c_void_p),
        ("new_segment_callback_user_data", ctypes.c_void_p),
        ("progress_callback", ctypes.c_void_p),
        ("progress_callback_user_data", ctypes.c_void_p),
        ("encoder_begin_callback", ctypes.c_void_p),
        ("encoder_begin_callback_user_data", ctypes.c_void_p),
        ("logits_filter_callback", ctypes.c_void_p),
        ("logits_filter_callback_user_data", ctypes.c_void_p),
    ]


WHISPER_SAMPLE_RATE = 16000
SAMPLES_30_SECS = WHISPER_SAMPLE_RATE * 30
SAMPLES_KEEP = WHISPER_SAMPLE_RATE * 1  # data to keep from the old inference
SAMPLES_STEP = WHISPER_SAMPLE_RATE * 3  # 3 seconds of new data

whisper = ctypes.CDLL(libname)
whisper.whisper_init_from_file.argtypes = [ctypes.c_char_p]
whisper.whisper_init_from_file.restype = ctypes.c_void_p
whisper.whisper_full_default_params.restype = WhisperFullParams
whisper.whisper_full_get_segment_text.restype = ctypes.c_char_p
# whisper.h declares the segment timestamps as int64_t; without an explicit
# restype ctypes would read them back as a (truncating) C int.
whisper.whisper_full_get_segment_t0.restype = ctypes.c_int64
whisper.whisper_full_get_segment_t1.restype = ctypes.c_int64

ctx = whisper.whisper_init_from_file(fname_model.encode("utf-8"))


async def whisper_task(stream: rtc.AudioStream):
    """Run whisper on a sliding window of audio read from *stream*.

    Frames are resampled to 16 kHz mono, converted to float32 and appended to
    a 30 second buffer. Every time at least SAMPLES_STEP new samples have been
    collected, inference runs over the kept tail of the previous window plus
    the new samples and every resulting segment is logged.

    The task ends when the stream is exhausted, or as soon as whisper_full
    reports a non-zero status (which is logged as an error).
    """
    data_30_secs = np.zeros(SAMPLES_30_SECS, dtype=np.float32)
    # nb. of samples written to data_30_secs for the cur. inference
    written_samples = 0

    async for frame in stream:
        # whisper requires 16kHz mono, so resample the data
        # also convert the samples from int16 to float32
        frame = frame.remix_and_resample(WHISPER_SAMPLE_RATE, 1)
        data = np.frombuffer(frame.data, dtype=np.int16).astype(np.float32) / 32768.0

        # write the data inside data_30_secs at written_samples; the first
        # SAMPLES_KEEP slots are reserved for the tail of the previous window
        data_start = SAMPLES_KEEP + written_samples
        data_30_secs[data_start : data_start + len(data)] = data
        written_samples += len(data)

        if written_samples < SAMPLES_STEP:
            continue  # not enough new audio yet

        params = whisper.whisper_full_default_params(
            WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY
        )
        params.print_realtime = False
        params.print_progress = False

        ctx_ptr = ctypes.c_void_p(ctx)
        data_ptr = data_30_secs.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        # NOTE(review): this is a synchronous native call made from inside a
        # coroutine, so the event loop does not run other tasks until it
        # returns - consider offloading it to an executor.
        res = whisper.whisper_full(
            ctx_ptr, params, data_ptr, written_samples + SAMPLES_KEEP
        )

        if res != 0:
            logging.error("error while running inference: %s", res)
            return

        n_segments = whisper.whisper_full_n_segments(ctx_ptr)
        for i in range(n_segments):
            t0 = whisper.whisper_full_get_segment_t0(ctx_ptr, i)
            t1 = whisper.whisper_full_get_segment_t1(ctx_ptr, i)
            txt = whisper.whisper_full_get_segment_text(ctx_ptr, i)

            # NOTE(review): the /1000.0 scaling assumes millisecond
            # timestamps - confirm the unit against whisper.h.
            logging.info(
                f"{t0/1000.0:.3f} - {t1/1000.0:.3f} : {txt.decode('utf-8')}"
            )

        # write old data to the beginning of the buffer (SAMPLES_KEEP).
        # The valid audio ends at SAMPLES_KEEP + written_samples, so the tail
        # to carry over is the SAMPLES_KEEP samples right before that point.
        # (Adding written_samples to data_start, which already contains most
        # of it, would read far past the data that was just transcribed.)
        data_end = SAMPLES_KEEP + written_samples
        data_30_secs[:SAMPLES_KEEP] = data_30_secs[data_end - SAMPLES_KEEP : data_end]
        written_samples = 0


async def main(room: rtc.Room):
    """Connect *room* and start a whisper task for every microphone track.

    Registers handlers that subscribe to published microphone audio tracks
    and start a transcription task once a track is subscribed, connects to
    URL with TOKEN (auto-subscribe disabled), then subscribes to microphone
    tracks that were already published before the connection was made.
    """
    # The event loop only keeps weak references to tasks, so hold a strong
    # reference to each transcription task until it has finished.
    stream_tasks = set()

    @room.on("track_published")
    def on_track_published(
        publication: rtc.RemoteTrackPublication, participant: rtc.RemoteParticipant
    ):
        # Only subscribe to the audio tracks coming from the microphone
        if (
            publication.kind == rtc.TrackKind.KIND_AUDIO
            and publication.source == rtc.TrackSource.SOURCE_MICROPHONE
        ):
            logging.info(
                "track published: %s from participant %s (%s), subscribing...",
                publication.sid,
                participant.sid,
                participant.identity,
            )

            publication.set_subscribed(True)

    @room.on("track_subscribed")
    def on_track_subscribed(
        track: rtc.Track,
        publication: rtc.RemoteTrackPublication,
        participant: rtc.RemoteParticipant,
    ):
        logging.info("starting listening to: %s", participant.identity)
        audio_stream = rtc.AudioStream(track)
        task = asyncio.create_task(whisper_task(audio_stream))
        stream_tasks.add(task)
        task.add_done_callback(stream_tasks.discard)

    await room.connect(URL, TOKEN, rtc.RoomOptions(auto_subscribe=False))
    logging.info("connected to room %s", room.name)

    # check if there are already published audio tracks
    for participant in room.participants.values():
        for track in participant.tracks.values():
            if (
                track.kind == rtc.TrackKind.KIND_AUDIO
                and track.source == rtc.TrackSource.SOURCE_MICROPHONE
            ):
                track.set_subscribed(True)


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        handlers=[logging.FileHandler("whisper.log"), logging.StreamHandler()],
    )

    # asyncio.get_event_loop() is deprecated when no loop is running or set,
    # so create the loop explicitly and install it as the current one.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    room = rtc.Room(loop=loop)

    async def cleanup():
        """Disconnect from the room, then stop the event loop."""
        await room.disconnect()
        loop.stop()

    # keep a reference so the task cannot be garbage-collected mid-run
    main_task = loop.create_task(main(room))
    for sig in (SIGINT, SIGTERM):
        loop.add_signal_handler(sig, lambda: asyncio.ensure_future(cleanup()))

    try:
        loop.run_forever()
    finally:
        loop.close()