Module eff_word_net.audio_processing
Expand source code
import glob
import tflite_runtime.interpreter as tflite
import os
import numpy as np
import random
from pprint import pprint
import json
# Directory that contains this module; both .tflite model files are
# resolved relative to it.
location = os.path.dirname(os.path.realpath(__file__))
# TFLite interpreter loaded from "logmelcalc.tflite". audioToVector feeds it
# the normalised audio and passes its first output on to the base model.
# NOTE(review): presumably computes log-mel features, judging by the file
# name — confirm against the model itself.
logmelcalc_interpreter = tflite.Interpreter(
model_path=os.path.join(location,"logmelcalc.tflite"
)
)
logmelcalc_interpreter.allocate_tensors()
# Tensor index of the first input of the logmelcalc model, and the full
# output-details list (audioToVector reads output_details[0]['index']).
input_index = logmelcalc_interpreter.get_input_details()[0]["index"]
output_details = logmelcalc_interpreter.get_output_details()
# TFLite interpreter loaded from "baseModel.tflite"; audioToVector returns
# the first output tensor of this model.
baseModel_interpreter = tflite.Interpreter(
model_path=os.path.join(location,"./baseModel.tflite")
)
baseModel_interpreter.allocate_tensors()
# Input/output detail lists of the base model; only element [0] of each is
# used by audioToVector.
base_model_inp = baseModel_interpreter.get_input_details()
base_model_out = baseModel_interpreter.get_output_details()
def _randomCrop(x:np.array,length=16000)->np.array :
    """
    Return a randomly positioned contiguous window of `length` samples.

    Inp Parameters :
        x      : array whose first dimension is strictly longer than `length`
        length : number of samples to keep
    Out Parameters :
        slice of `x` with `length` entries along the first axis

    Raises AssertionError when `x` is not strictly longer than `length`.
    """
    total = x.shape[0]
    assert total > length
    # randint is inclusive on both ends, so the window may end exactly at
    # the last sample.
    start = random.randint(0, total - length)
    stop = start + length
    return x[start:stop]
def _addPadding(x:np.array,length=16000)->np.array :
    """
    Zero-pad `x` up to `length` samples, splitting the padding at random
    between the front and the back.

    Inp Parameters :
        x      : array whose first dimension is strictly shorter than `length`
        length : number of samples in the result
    Out Parameters :
        flattened array: leading zeros, the samples of `x`, trailing zeros

    Raises AssertionError when `x` is not strictly shorter than `length`.
    """
    assert x.shape[0] < length
    missing = length - x.shape[0]
    # randint is inclusive, so all padding may land on either side.
    lead = random.randint(0, missing)
    trail = missing - lead
    # The zero blocks are float64, so the result is promoted accordingly.
    return np.concatenate((np.zeros(lead), np.ravel(x), np.zeros(trail)))
def _removeExistingPadding(x:np.array)->np.array:
    """
    Strip the leading and trailing runs of zero samples from `x`.

    Zeros that lie between non-zero samples are kept.

    Inp Parameters :
        x : 1-D sequence of audio samples
    Out Parameters :
        slice of `x` from the first to the last non-zero sample
        (inclusive); an empty slice when every sample is zero
    """
    nonzero_positions = np.flatnonzero(x)
    if nonzero_positions.size == 0:
        # Nothing but padding: return an empty slice of the input.
        return x[:0]
    first = nonzero_positions[0]
    last = nonzero_positions[-1]
    # Start AT the first non-zero sample. The previous loop-based version
    # started at the last leading zero and so kept one padding sample.
    return x[first:last + 1]
def fixPaddingIssues(x:np.array,length=16000)-> np.array:
    """
    Bring an audio clip to exactly `length` samples.

    Existing zero padding is stripped first. The remaining audio is then
    randomly cropped when too long, or zero-padded at a random offset when
    too short.

    Inp Parameters :
        x      : 1-D array of audio samples
        length : target number of samples (default 16000)
    Out Parameters :
        array with `length` samples
    """
    x = _removeExistingPadding(x)
    sample_count = x.shape[0]
    # Compare against `length`, not a fixed 16000, so a non-default target
    # takes the correct branch instead of tripping the helper's assert.
    if sample_count > length:
        return _randomCrop(x, length=length)
    if sample_count < length:
        return _addPadding(x, length=length)
    return x
def audioToVector(inpAudio:np.array) -> np.array :
    """
    Converts 16000Hz sampled 1 sec of audio to vector embedding

    The audio is divided by its maximum sample value, run through the
    logmelcalc model, and the result is fed to the base model.

    Inp Parameters :
        inpAudio : np.array of shape (16000,)
    Out Parameters :
        first output tensor of the base model; documented by the original
        author as 1 vector embedding of shape (128,1)

    Raises AssertionError when `inpAudio` does not have shape (16000,).
    """
    assert(inpAudio.shape==(16000,))
    peak = inpAudio.max()
    # An all-zero clip has a peak of 0; dividing by it would feed NaN/inf to
    # the model. Pass such a clip through unscaled. Non-zero peaks keep the
    # original max-based scaling.
    normalised = inpAudio / peak if peak != 0 else inpAudio
    logmelcalc_interpreter.set_tensor(
        input_index,
        np.expand_dims(normalised, axis=0).astype("float32")
    )
    logmelcalc_interpreter.invoke()
    logmel_output = logmelcalc_interpreter.get_tensor(output_details[0]['index'])
    # Add a leading and a trailing axis of size one before handing the
    # features to the base model.
    baseModel_interpreter.set_tensor(
        base_model_inp[0]["index"],
        np.expand_dims(logmel_output, axis=(0, -1)).astype("float32")
    )
    baseModel_interpreter.invoke()
    output_data = baseModel_interpreter.get_tensor(base_model_out[0]['index'])
    return output_data
Functions
def audioToVector(inpAudio: np.array) -> np.array
Converts 16000Hz sampled 1 sec of audio to vector embedding Inp Parameters :
inpAudio : np.array of shape (16000,)
Out Parameters :
1 vector embedding of shape (128,1)
Expand source code
def audioToVector(inpAudio:np.array) -> np.array :
    """
    Converts 16000Hz sampled 1 sec of audio to vector embedding

    The audio is divided by its maximum sample value, run through the
    logmelcalc model, and the result is fed to the base model.

    Inp Parameters :
        inpAudio : np.array of shape (16000,)
    Out Parameters :
        first output tensor of the base model; documented by the original
        author as 1 vector embedding of shape (128,1)

    Raises AssertionError when `inpAudio` does not have shape (16000,).
    """
    assert(inpAudio.shape==(16000,))
    peak = inpAudio.max()
    # An all-zero clip has a peak of 0; dividing by it would feed NaN/inf to
    # the model. Pass such a clip through unscaled. Non-zero peaks keep the
    # original max-based scaling.
    normalised = inpAudio / peak if peak != 0 else inpAudio
    logmelcalc_interpreter.set_tensor(
        input_index,
        np.expand_dims(normalised, axis=0).astype("float32")
    )
    logmelcalc_interpreter.invoke()
    logmel_output = logmelcalc_interpreter.get_tensor(output_details[0]['index'])
    # Add a leading and a trailing axis of size one before handing the
    # features to the base model.
    baseModel_interpreter.set_tensor(
        base_model_inp[0]["index"],
        np.expand_dims(logmel_output, axis=(0, -1)).astype("float32")
    )
    baseModel_interpreter.invoke()
    output_data = baseModel_interpreter.get_tensor(base_model_out[0]['index'])
    return output_data
def fixPaddingIssues(x: np.array, length=16000) -> np.array
Expand source code
def fixPaddingIssues(x:np.array,length=16000)-> np.array:
    """
    Bring an audio clip to exactly `length` samples.

    Existing zero padding is stripped first. The remaining audio is then
    randomly cropped when too long, or zero-padded at a random offset when
    too short.

    Inp Parameters :
        x      : 1-D array of audio samples
        length : target number of samples (default 16000)
    Out Parameters :
        array with `length` samples
    """
    x = _removeExistingPadding(x)
    sample_count = x.shape[0]
    # Compare against `length`, not a fixed 16000, so a non-default target
    # takes the correct branch instead of tripping the helper's assert.
    if sample_count > length:
        return _randomCrop(x, length=length)
    if sample_count < length:
        return _addPadding(x, length=length)
    return x