voice key detection
Hello all,
I receive quite a few requests for the script for setting up a voice key (voice response trigger). Below you can find the complete script I came up with, mixing a bit of everything I found here and there.
https://forum.cogsci.nl/discussion/5965/voice-key-recording
https://forum.cogsci.nl/discussion/8728/voice-key-and-recording
Several interesting functionalities :
- Online voice key detection (without interruption of sound recording) with a very simple algorithm - basically, when a loudness threshold is reached for a certain number of consecutive frames, you trigger something.
- Recording of a .wav file of the vocal response
- Implementation of a more "complex" voice onset algorithm relying on the whole vocal response to define a more accurate threshold
- Recording of a .csv file of the loudness data, timestamps and the identified voice onset
- Plotting of the loudness data with the identified voice onset threshold
To make it work, I use 4 different inline components :
Create participant's folder
# Place this at the top of the experimental sequence to create a
# participant folder in which all the data will be saved.
import os

# Creates a folder for each subject; all files for this subject are saved
# there later on. experiment_path and subject_nr are injected by OpenSesame.
newpath = os.path.join(experiment_path, str(subject_nr))
# exist_ok=True makes this idempotent (no crash if the folder already
# exists, e.g. when the experiment is restarted for the same subject)
# and avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(newpath, exist_ok=True)
Setup of the microphone
# Place this BEFORE the component for which you want to record the sound.
import pyaudio
import struct
import math
import wave
import pandas as pd
import numpy as np

# Create trial count variable - replace "practice_sequence" by the name of
# your trial sequence (the count_* variable is provided by OpenSesame).
trial_count = count_pratice_sequence + 1

# Number of standard deviations above the mean (e.g. 1.5 x SD) taken as the
# threshold for voice onset detection - CHANGE IF MICROPHONE WAS NOT CLOSE
# ENOUGH OR DETECTION SETTING WAS NOT HIGH ENOUGH.
nSD = 1
# Percentage of the peak signal in a file (e.g. 0.01 = 1 % of peak signal)
# taken as the minimal threshold value allowed.
minTH = 0.005
# Maximum response time - recording duration (in milliseconds).
timeout = 3000

# Parameters of the PyAudio object.
FORMAT = pyaudio.paInt16
SHORT_NORMALIZE = (1.0 / 32768.0)  # scales signed 16-bit samples to [-1, 1]
CHANNELS = 2
RATE = 44100
INPUT_BLOCK_TIME = 0.01
INPUT_FRAMES_PER_BLOCK = int(RATE * INPUT_BLOCK_TIME)

# Path where sound files will be saved.
FILENAME = experiment_path + "\\" + str(subject_nr) + "\\" + "p_" + str(trial_count) + ".wav"
CHUNCK = 1024

p = pyaudio.PyAudio()


def get_rms(block):
    """Get root mean square of a block of 16-bit samples as a measure of loudness.

    `block` is the raw bytes returned by stream.read(); each sample is two
    bytes ('h' in struct format), so the sample count is len(block) // 2.
    """
    # Integer division: len(block) / 2 would be a float in Python 3, which is
    # the wrong type for a struct repeat count.
    count = len(block) // 2
    format = "%dh" % (count)
    shorts = struct.unpack(format, block)
    sum_squares = 0.0
    for sample in shorts:
        n = sample * SHORT_NORMALIZE
        sum_squares += n * n
    return math.sqrt(sum_squares / count)


# Opens the microphone.
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    input_device_index=0,
    frames_per_buffer=INPUT_FRAMES_PER_BLOCK,
)
Sound recording and online feedback
# Place this AFTER the component for which you want to record the sound.
frames = []     # raw audio blocks, joined later into the .wav file
list_ldn = []   # loudness (RMS) per block
list_ct = []    # clock.time() timestamp per block
# Listen for sounds until a timeout occurs.
start_time = clock.time()
print(f"Trial n° {trial_count} starts recording")
flag_triggered = False   # becomes True after the first online detection
loudness_counter = 0     # consecutive blocks above the online threshold
while clock.time() - start_time <= timeout:
    try:
        block = stream.read(CHUNCK)
        frames.append(block)
    except IOError as e:
        print(e)
        # Skip the rest of this iteration: 'block' would be stale here (or
        # undefined on the very first pass), so it must not be analysed or
        # logged as if it were fresh data.
        continue
    loudness = get_rms(block)
    if loudness > 0.025:
        loudness_counter += 1
    else:
        loudness_counter = 0
    # Checks when the threshold of 0.025 has been reached for 2 consecutive
    # frames for the first time - and ensures that no vocal response is given
    # before 100 ms. clock.time() is in milliseconds (see timeout = 3000 and
    # the "ms" print below), so the guard compares against 100, not 0.1.
    if loudness_counter >= 2 and not flag_triggered and (clock.time() - start_time) > 100:
        # When the threshold is reached - presentation of the
        # 'response_detection_feedback' sketchpad.
        my_canvas = items[u'response_detection_feedback'].canvas
        my_canvas.text('REPONSE DETECTEE', center=True)
        my_canvas.show()
        flag_triggered = True  # set the flag to True after the first trigger
    list_ldn.append(loudness)
    list_ct.append(clock.time())
end_time = clock.time()
# Close the audio stream.
stream.stop_stream()
stream.close()
p.terminate()
print("%f ms total duration" % (end_time - start_time))
print(f"Trial n° {trial_count} stops recording", end="\n\n")
# Saves the sound file.
wf = wave.open(FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
Data saving and threshold detection
# Place this at the very end of the trial_sequence - preferably just after a
# blank screen to buffer any potential lags.
import pandas as pd
import numpy as np
import plotly.express as px
# Creates a dataframe with clock time, loudness, and trial-relative time (ms).
df = pd.DataFrame({'clock_time': list_ct, 'loudness': list_ldn})
df['trial_time'] = df['clock_time'] - start_time
# Calculate the mean and standard deviation of the 'loudness' column.
loudness_mean = np.mean(df['loudness'])
loudness_std = np.std(df['loudness'])
# Calculate the threshold based on nSD standard deviations above the mean.
threshold_nSD = loudness_mean + loudness_std * nSD
# Calculate the peak loudness.
peak_loudness = np.max(df['loudness'])
# Calculate the threshold based on minTH percentage of peak loudness.
threshold_minTH = minTH * peak_loudness
# Determine the sound threshold as the higher of the two calculated thresholds.
sound_threshold = max(threshold_nSD, threshold_minTH)
# These are logged whether or not a vocal response was detected.
var.trial_start = start_time
var.trial_end = end_time
# Compute a response time only if the recording lasted more than 100 ms.
# trial_time is in milliseconds (clock.time() units), so the comparison is
# against 100, not 0.1.
if df['trial_time'].iloc[-1] > 100:
    try:
        # First sample whose loudness exceeds the threshold = voice onset.
        response_time = df.loc[df['loudness'] > sound_threshold, 'trial_time'].iloc[0]
        response = 1
        var.loudness = df['loudness'].tolist()
    except IndexError:
        # No sample ever crossed the threshold: no vocal response.
        response_time = None
        response = 0
        var.loudness = None
else:
    response_time = None
    response = 0
    var.loudness = None
if response == 1:
    print("Vocal key detected")
else:
    print("Vocal key not detected")
# If a keyboard response is also used, this will append the suffix 'voice_key'
# to the OpenSesame variables response and response_time.
responses.add(response=response, response_time=response_time, item='voice_key')
# Lists the variables to include in the data file - DO NOT use a logger if you
# use this.
INCLUDE = ['subject_nr', 'trial_count', 'practice', 'response_time', 'correct', 'response', 'trial_start', 'trial_end', 'loudness', 'target', 'sound_threshold', 'id_acteur', 'genre_acteur', 'age_acteur', 'emotion_face', 'emotion_body', 'congruence', 'condition', 'image', 'x_coordination', 'direction', 'correct_response']
log.write_vars([log_var for log_var in var.inspect() if log_var in INCLUDE])
# Add new variables to the DataFrame.
df['subject_nr'] = subject_nr
df['trial_count'] = trial_count
df['response_threshold'] = (df['loudness'] > sound_threshold).astype(int)
# Save the DataFrame to a CSV file.
csv_filename = f"{experiment_path}\\{subject_nr}\\p_{trial_count}.csv"
df.to_csv(csv_filename, index=False)
# Optional: Create and save an interactive plot - comment out if unnecessary.
fig = px.line(df, x="trial_time", y="loudness", markers=True)
fig.update_yaxes(range=[0, 0.05])
if response_time is not None:
    fig.add_vline(x=response_time)
    fig.update_layout(
        title_text=f"The response time was {response_time}"
    )
else:
    fig.update_layout(
        title_text="No response was detected")
fig.write_html(experiment_path + "\\" + str(subject_nr) + "\\" + str(trial_count) + ".html")
It is not perfect but it does the job - I hope this helps, and I would love to get feedback if you notice anything.
Comments
Thank you! 🤗 Really great to see that you're sharing this.
Check out SigmundAI.eu for our OpenSesame AI assistant!