Hindi Language Transcription: Speech to Text (STT) Using the Vosk Model


Vosk is an open-source speech recognition toolkit widely used for building speech-to-text (STT) systems. It supports multiple languages, including Hindi, and can be used in various applications due to its efficient, real-time transcription capabilities. Here’s an overview of how to use the Vosk model for Hindi language transcriptions in a speech-to-text application:

Overview of the Vosk Model

Vosk is built on top of the Kaldi speech recognition toolkit, which is known for its accuracy and efficiency. Vosk provides a set of pre-trained models that can be used out of the box for recognizing speech in different languages.
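
As a starting point, here is a minimal sketch of file-based transcription with the official vosk Python package (installable with pip install vosk). It assumes you have downloaded a Hindi model from the official Vosk model list (https://alphacephei.com/vosk/models) and unpacked it to a local directory; the model path and WAV filename below are placeholders.

import json
import sys
import wave

from vosk import Model, KaldiRecognizer

MODEL_PATH = "vosk_model/Vosk_small_hindi"  # adjust to your unpacked model directory
AUDIO_FILE = "sample_hindi.wav"             # placeholder: 16 kHz, 16-bit, mono WAV

# Open the WAV file and make sure it matches what the recognizer expects
wf = wave.open(AUDIO_FILE, "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2:
    sys.exit("Audio must be 16-bit mono WAV")

model = Model(MODEL_PATH)
rec = KaldiRecognizer(model, wf.getframerate())

# Feed the audio in chunks and print each finalized segment
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(json.loads(rec.Result()).get("text", ""))

# Flush the last pending segment
print(json.loads(rec.FinalResult()).get("text", ""))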

Key Features of Vosk

  1. Multilingual Support: Supports multiple languages including Hindi.
  2. Offline Capabilities: Can run offline without needing an internet connection.
  3. Real-time Processing: Capable of processing speech in real-time.
  4. Compact Models: Efficient models that can run on various devices, including mobile phones and Raspberry Pi.

Here is the full code for real-time Hindi transcription from the microphone:

import os
import sys
import json
import pyaudio
import time
import threading
from vosk import Model, KaldiRecognizer
from gtts import gTTS
from playsound import playsound

# Path to the Vosk model and its configuration
model_path = "vosk_model/Vosk_small_hindi"
if not os.path.exists(model_path):
    print(f"Please download a Vosk model to {model_path}")
    sys.exit(1)

# Initialize the Vosk model and recognizer
model = Model(model_path)
rec = KaldiRecognizer(model, 16000)

# Setup audio stream
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024

p = pyaudio.PyAudio()

stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)

print("Listening...")

# Flag to prevent transcription during TTS playback
transcribing_enabled = True


def play_tts(text):
    global transcribing_enabled
    transcribing_enabled = False  # Disable transcription temporarily
    tts = gTTS(text=text, lang='hi')
    tts.save("jai_shri_ram.mp3")
    playsound("jai_shri_ram.mp3")
    os.remove("jai_shri_ram.mp3")
    transcribing_enabled = True  # Re-enable transcription


try:
    while True:
        try:
            data = stream.read(CHUNK, exception_on_overflow=False)
        except OSError as e:
            print(f"Error reading audio stream: {e}")
            continue

        if len(data) == 0:
            break

        if transcribing_enabled and rec.AcceptWaveform(data):
            result = rec.Result()
            result_dict = json.loads(result)  # Parse the JSON result string
            if 'text' in result_dict:
                timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
                transcription = result_dict['text']
                print(f"[{timestamp}] {transcription}")

                if "जय श्री राम" in transcription or "जय श्रीराम" in transcription:
                    tts_thread = threading.Thread(target=play_tts, args=("जय श्री राम",))
                    tts_thread.start()

                if "नाव" in transcription or "नौ" in transcription:
                    tts_thread = threading.Thread(target=play_tts, args=("आपका स्वागत है। मैं एक नव रोबोट हूँ। मैं आपकी कैसे मदद कर सकता हूँ?",))
                    tts_thread.start()

except KeyboardInterrupt:
    print("Stopping...")
finally:
    try:
        stream.stop_stream()
    except OSError as e:
        print(f"Error stopping stream: {e}")
    stream.close()
    p.terminate()

This script sets up a voice recognition system using the Vosk speech recognition toolkit and provides responses with text-to-speech using the gTTS library. It listens for specific phrases in Hindi and responds accordingly.

Explanation of Code

  • Importing Libraries: The necessary libraries are imported. vosk handles speech recognition, pyaudio captures audio from the microphone, and gTTS together with playsound produces the spoken responses.
  • Loading the Model: The Vosk Hindi model is loaded from the specified directory and a KaldiRecognizer is created for 16 kHz audio.
  • Recognizing Speech: Microphone audio is read in chunks and passed to the recognizer with AcceptWaveform. Each finalized result is parsed, timestamped, and printed to the console (see the snippet below for the result format).
  • Responding with TTS: When a trigger phrase such as "जय श्री राम" appears in the transcription, a gTTS response is generated and played in a background thread while transcription is paused via the transcribing_enabled flag.
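
For reference, rec.Result() and rec.PartialResult() return JSON strings, which is why the script parses them before reading the text. Below is a minimal sketch of handling both cases; the helper function and its name are illustrative, not part of the original script.

import json

def handle_chunk(rec, data):
    """Return (text, is_final) for one audio chunk fed to a KaldiRecognizer."""
    if rec.AcceptWaveform(data):
        # A finalized segment: Result() returns something like '{"text": "..."}'
        final = json.loads(rec.Result())
        return final.get("text", ""), True
    # Still mid-utterance: PartialResult() returns '{"partial": "..."}'
    partial = json.loads(rec.PartialResult())
    return partial.get("partial", ""), False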

Applications

  1. Transcription Services: Automated Hindi audio transcription for documentation, media, and content creation.
  2. Assistive Technologies: Helping individuals with hearing impairments by converting speech to text in real-time.
  3. Voice Commands: Enabling voice control for applications in Hindi (see the sketch after this list).
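
Building on the trigger-phrase checks in the script above, one simple way to structure Hindi voice commands is a mapping from keywords to handler functions. This is a minimal sketch, assuming the transcription string comes from a Vosk result as in the code above; the phrases and handlers are placeholders.

# Map Hindi trigger words to handler functions (placeholders for illustration)
def greet():
    print("Greeting command recognized")

def stop_listening():
    print("Stop command recognized")

COMMANDS = {
    "नमस्ते": greet,         # "hello"
    "रुको": stop_listening,  # "stop"
}

def dispatch_command(transcription):
    """Call the handler for the first keyword found in a transcription."""
    for keyword, handler in COMMANDS.items():
        if keyword in transcription:
            handler()
            return True
    return False

# Example: dispatch_command(result_dict['text']) after each final result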

By using the Vosk model for Hindi, developers can create robust and efficient speech-to-text applications tailored for Hindi-speaking users.

Updated Code for Handling Long Transcriptions

import os
import sys
import json
import pyaudio
import time
import threading
from vosk import Model, KaldiRecognizer
from gtts import gTTS
from playsound import playsound

# Path to the Vosk model and its configuration
model_path = "vosk_model/Vosk_small_hindi"
if not os.path.exists(model_path):
    print(f"Please download a Vosk model to {model_path}")
    sys.exit(1)

# Initialize the Vosk model and recognizer
model = Model(model_path)
rec = KaldiRecognizer(model, 16000)

# Setup audio stream
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024

p = pyaudio.PyAudio()

stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)

print("Listening...")

# Flag to prevent transcription during TTS playback
transcribing_enabled = True

def play_tts(text):
    global transcribing_enabled
    transcribing_enabled = False  # Disable transcription temporarily
    tts = gTTS(text=text, lang='hi')
    tts.save("jai_shri_ram.mp3")
    playsound("jai_shri_ram.mp3")
    os.remove("jai_shri_ram.mp3")
    transcribing_enabled = True  # Re-enable transcription

def process_result(result):
    result_dict = json.loads(result)  # Parse the JSON result string
    if 'text' in result_dict:
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        transcription = result_dict['text']
        print(f"[{timestamp}] {transcription}")

        if "जय श्री राम" in transcription or "जय श्रीराम" in transcription:
            tts_thread = threading.Thread(target=play_tts, args=("जय श्री राम",))
            tts_thread.start()

        if "नाव" in transcription or "नौ" in transcription:
            tts_thread = threading.Thread(target=play_tts, args=("आपका स्वागत है। मैं एक नव रोबोट हूँ। मैं आपकी कैसे मदद कर सकता हूँ?",))
            tts_thread.start()

try:
    partial_transcription = ""
    while True:
        try:
            data = stream.read(CHUNK, exception_on_overflow=False)
        except OSError as e:
            print(f"Error reading audio stream: {e}")
            continue

        if len(data) == 0:
            break

        if transcribing_enabled:
            if rec.AcceptWaveform(data):
                result = rec.Result()
                process_result(result)
                partial_transcription = ""
            else:
                partial_result = rec.PartialResult()
                partial_transcription = json.loads(partial_result).get('partial', '')
                # print(f"Partial: {partial_transcription}")

except KeyboardInterrupt:
    print("Stopping...")
finally:
    try:
        stream.stop_stream()
    except OSError as e:
        print(f"Error stopping stream: {e}")
    stream.close()
    p.terminate()
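
Since the updated loop is aimed at longer sessions, it can be useful to persist each finalized segment instead of only printing it. Below is a minimal sketch that appends timestamped transcriptions to a text file; the filename and the suggested call site inside process_result are assumptions, not part of the original script.

import time

LOG_FILE = "hindi_transcriptions.txt"  # placeholder output file

def log_transcription(transcription, log_file=LOG_FILE):
    """Append one finalized transcription with a timestamp to a UTF-8 log file."""
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"[{timestamp}] {transcription}\n")

# Inside process_result(), after printing the transcription:
#     log_transcription(transcription)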




