voice key detection
Hello all,
I receive quite a few requests for the script for setting up a voice key (voice response trigger). Below you can find the complete script I came up with, mixing a bit of everything I found here and there.
https://forum.cogsci.nl/discussion/5965/voice-key-recording
https://forum.cogsci.nl/discussion/8728/voice-key-and-recording
Several interesting functionalities :
- Online voice key detection (without interruption of sound recording) with a very simple algorithm - basically, when a loudness threshold is reached for a certain number of consecutive frames, you trigger something.
- Recording of a .wav file of the vocal response
- Implementation of a more "complex" voice onset algorithm relying on the whole vocal response to define a more accurate threshold
- Recording of a .csv file of the loudness data, timestamps and the identified voice onset
- Plotting of the loudness data with the identified voice onset threshold
To make it work, I use 4 different inline components :
Create participant's folder
# Place this at the top of the experimental sequence to create a participant
# folder in which all data files will be saved later on.
import os

# One folder per subject. os.path.join builds the path portably (equivalent
# to the original experiment_path + "\\" + subject_nr on Windows, where
# OpenSesame experiments typically run).
newpath = os.path.join(experiment_path, str(subject_nr))
# exist_ok=True replaces the check-then-create pattern, which could crash
# if the folder appeared between the exists() check and makedirs().
os.makedirs(newpath, exist_ok=True)
Setup of the microphone
# Place this BEFORE the component for which you want to record the sound.
import pyaudio
import struct
import math
import wave
import pandas as pd
import numpy as np

# Trial counter - replace "pratice_sequence" with the name of your own trial
# sequence (OpenSesame exposes a count_<item_name> variable for each item).
trial_count = count_pratice_sequence + 1

# Number of standard deviations above the mean (e.g. 1.5 x SD) taken as the
# threshold for voice onset detection - INCREASE IF THE MICROPHONE WAS NOT
# CLOSE ENOUGH OR THE DETECTION SETTING WAS NOT HIGH ENOUGH.
nSD = 1
# Fraction of the peak signal (e.g. 0.01 = 1 % of peak signal) taken as the
# minimal threshold value allowed.
minTH = 0.005
# Maximum response time - recording duration, in milliseconds.
timeout = 3000

# Parameters of the PyAudio stream.
FORMAT = pyaudio.paInt16            # 16-bit signed samples
SHORT_NORMALIZE = (1.0 / 32768.0)   # scales int16 samples into [-1.0, 1.0]
CHANNELS = 2
RATE = 44100
INPUT_BLOCK_TIME = 0.01             # seconds of audio per analysis block
INPUT_FRAMES_PER_BLOCK = int(RATE * INPUT_BLOCK_TIME)

# Path where the .wav file of this trial will be saved.
FILENAME = experiment_path + "\\" + str(subject_nr) + "\\" + "p_" + str(trial_count) + ".wav"
# NOTE: misspelling of "CHUNK" kept on purpose - the recording script reads it.
CHUNCK = 1024

p = pyaudio.PyAudio()


def get_rms(block):
    """Return the root mean square of an int16 audio block as loudness.

    `block` is the raw bytes object returned by stream.read(); each sample
    is 2 bytes, so the sample count is len(block) // 2.
    """
    # BUGFIX: integer division - the original len(block)/2 produced a float
    # that only worked by accident through the "%d" format round-trip, and
    # also shadowed the builtin name `format`.
    n_samples = len(block) // 2
    samples = struct.unpack("%dh" % n_samples, block)
    sum_squares = 0.0
    for sample in samples:
        normalized = sample * SHORT_NORMALIZE
        sum_squares += normalized * normalized
    return math.sqrt(sum_squares / n_samples)


# Open the microphone stream.
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    input_device_index=0,
    frames_per_buffer=INPUT_FRAMES_PER_BLOCK,
)
Sound recording and online feedback
# Place this AFTER the component for which you want to record the sound.
frames = []     # raw audio blocks, joined into the .wav file at the end
list_ldn = []   # loudness (RMS) per block
list_ct = []    # clock.time() timestamp per block

# Listen for sounds until a timeout occurs.
start_time = clock.time()
print(f"Trial n° {trial_count} starts recording")
flag_triggered = False   # becomes True after the first online detection
loudness_counter = 0     # consecutive blocks above the online threshold
while clock.time() - start_time <= timeout:
    try:
        block = stream.read(CHUNCK)
        frames.append(block)
    except IOError as e:
        print(e)
        # BUGFIX: skip this iteration - on a read error `block` is stale
        # (or undefined on the very first iteration), so analysing it
        # would duplicate data or raise a NameError.
        continue
    loudness = get_rms(block)
    if loudness > 0.025:
        loudness_counter += 1
    else:
        loudness_counter = 0
    # Trigger once, when the 0.025 threshold has been exceeded for 2
    # consecutive blocks - and never within the first 100 ms of the trial.
    # BUGFIX: clock.time() is in milliseconds, so the original "> 0.1"
    # allowed triggers after only 0.1 ms; "> 100" implements the intended
    # 100 ms lockout.
    if loudness_counter >= 2 and not flag_triggered and (clock.time() - start_time) > 100:
        # When the threshold is reached - present the
        # 'response_detection_feedback' sketchpad.
        my_canvas = items[u'response_detection_feedback'].canvas
        my_canvas.text('REPONSE DETECTEE', center=True)
        my_canvas.show()
        flag_triggered = True  # only trigger on the first detection
    list_ldn.append(loudness)
    list_ct.append(clock.time())
end_time = clock.time()

# Close the audio stream.
stream.stop_stream()
stream.close()
p.terminate()
print("%f ms total duration" % (end_time - start_time))
print(f"Trial n° {trial_count} stops recording", end="\n\n")

# Save the .wav file of this trial.
wf = wave.open(FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
Data saving and threshold detection
# Place this at the very end of the trial_sequence - preferably just after a
# blank screen to buffer any potential lags.
import pandas as pd
import numpy as np
import plotly.express as px

# DataFrame with clock time, loudness and trial-relative time (milliseconds).
df = pd.DataFrame({'clock_time': list_ct, 'loudness': list_ldn})
df['trial_time'] = df['clock_time'] - start_time

# Offline threshold: the larger of (mean + nSD x SD) and (minTH x peak).
loudness_mean = np.mean(df['loudness'])
loudness_std = np.std(df['loudness'])
threshold_nSD = loudness_mean + loudness_std * nSD
peak_loudness = np.max(df['loudness'])
threshold_minTH = minTH * peak_loudness
sound_threshold = max(threshold_nSD, threshold_minTH)

# Defaults: no vocal response detected (overwritten below on success).
response_time = None
response = 0
var.trial_start = start_time
var.trial_end = end_time
var.loudness = None

# Compute a response time only if the recording lasted more than 100 ms.
# BUGFIX: trial_time is in milliseconds, so the original "> 0.1" compared
# against 0.1 ms instead of the intended 100 ms; also guard against an
# empty DataFrame (no audio blocks recorded), where .iloc[-1] would raise.
if not df.empty and df['trial_time'].iloc[-1] > 100:
    try:
        response_time = df.loc[df['loudness'] > sound_threshold, 'trial_time'].iloc[0]
        response = 1
        var.loudness = df['loudness'].tolist()
    except IndexError:
        # No sample ever crossed the offline threshold - keep the defaults.
        pass

if response == 1:
    print("Vocal key detected")
else:
    print("Vocal key not detected")

# If a keyboard response is also used, this will append the suffix
# 'voice_key' to the OpenSesame variables response and response_time.
responses.add(response=response, response_time=response_time, item='voice_key')

# Lists the variables to include in the data file - DO NOT use a logger if
# you use this.
INCLUDE = ['subject_nr', 'trial_count', 'practice', 'response_time', 'correct', 'response', 'trial_start', 'trial_end', 'loudness', 'target', 'sound_threshold', 'id_acteur', 'genre_acteur', 'age_acteur', 'emotion_face', 'emotion_body', 'congruence', 'condition', 'image', 'x_coordination', 'direction', 'correct_response']
log.write_vars([log_var for log_var in var.inspect() if log_var in INCLUDE])

# Per-trial loudness trace saved as a CSV file.
df['subject_nr'] = subject_nr
df['trial_count'] = trial_count
df['response_threshold'] = (df['loudness'] > sound_threshold).astype(int)
csv_filename = f"{experiment_path}\\{subject_nr}\\p_{trial_count}.csv"
df.to_csv(csv_filename, index=False)

# Optional: create and save an interactive plot - comment out if unnecessary.
fig = px.line(df, x="trial_time", y="loudness", markers=True)
fig.update_yaxes(range=[0, 0.05])
if response_time is not None:
    fig.add_vline(x=response_time)
    fig.update_layout(title_text=f"The response time was {response_time}")
else:
    fig.update_layout(title_text="No response was detected")
fig.write_html(experiment_path + "\\" + str(subject_nr) + "\\" + str(trial_count) + ".html")
It is not perfect but it does the job - I hope this helps, and I would love to receive feedback if you notice anything.
Comments
Thank you! 🤗 Really great to see that you're sharing this.
Check out SigmundAI.eu for our OpenSesame AI assistant!