From 920a7633cdf261846cc55bc31bc3c75142edc90c Mon Sep 17 00:00:00 2001 From: Vijay Yadev Date: Wed, 11 Nov 2020 21:57:04 -0500 Subject: [PATCH] nlp_transcribe --- Dockerfile | 13 ++- dbm_lib/config/config_raw_feature.py | 3 + dbm_lib/controller/process_feature.py | 15 ++++ .../raw_features/nlp/transcribe.py | 82 +++++++++++++++++++ .../raw_features/util/nlp_util.py | 66 +++++++++++++++ process_data.py | 6 ++ process_dbm.sh | 3 + requirements.txt | 1 + resources/features/raw_feature.yml | 3 + 9 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 dbm_lib/dbm_features/raw_features/nlp/transcribe.py create mode 100644 dbm_lib/dbm_features/raw_features/util/nlp_util.py diff --git a/Dockerfile b/Dockerfile index c3dfd352..5d425d61 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,9 @@ RUN apt-get update && apt-get install -y python3-pip \ && apt-get install -y libavcodec-dev \ && apt-get install -y libavformat-dev \ && apt-get install -y libavdevice-dev \ - && apt-get install -y libboost-all-dev + && apt-get install -y libboost-all-dev \ + && apt-get install -y git \ + && apt-get install -y sox RUN ln -sfn /usr/bin/pip3 /usr/bin/pip COPY . /app @@ -24,8 +26,15 @@ RUN dpkg --configure -a RUN su -c ./install.sh RUN echo "Done OpenFace!" -WORKDIR /app +RUN echo "Cloning DeepSpeech..." +WORKDIR /app/pkg +RUN git clone https://github.com/mozilla/DeepSpeech.git +WORKDIR /app/pkg/DeepSpeech +RUN wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.1/deepspeech-0.9.1-models.pbmm +RUN wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.1/deepspeech-0.9.1-models.scorer + +WORKDIR /app RUN pip install --upgrade pip RUN pip install -r requirements.txt RUN echo "Requirement txt done!" 
def process_nlp(video_uri, out_dir, dbm_group, r_config, deep_path):
    """
    Processing nlp features (speech transcription) for one input file.

    Nothing is processed when a non-empty dbm_group is given that does not
    contain 'nlp'. A None or empty dbm_group does not filter anything out.

    Args:
        video_uri: video path; out_dir: raw variable output dir
        dbm_group: list of features to process (may be None or empty)
        r_config: raw feature config object
        deep_path: deep speech build path
    """
    # Truthiness covers both "no selection given" cases: None and empty.
    if dbm_group and 'nlp' not in dbm_group:
        return

    logger.info('Processing nlp variables from data in {}'.format(video_uri))
    transcribe.run_transcribe(video_uri, out_dir, r_config, deep_path)
# ---- dbm_lib/dbm_features/raw_features/nlp/transcribe.py ----

formant_dir = 'nlp/transcribe'
csv_ext = '_transcribe.csv'
error_txt = 'error: length less than 0.1'


def calc_transcribe(video_uri, audio_file, out_loc, fl_name, r_config, deep_path):
    """
    Transcribing one audio file and saving the text as a one-row output file.

    Args:
        video_uri: input path, stored in the 'dbm_master_url' column
        audio_file: (.wav) parsed audio file; fl_name: input file name
        out_loc: (str) Output directory; r_config: raw variable config
        deep_path: deepspeech build path, forwarded to the transcription helper
    """
    text = n_util.process_deepspeech(audio_file, deep_path)
    df_transcribe = pd.DataFrame([text], columns=[r_config.nlp_transcribe])

    # Exact match on purpose (no regex=True): an empty regex pattern matches
    # every string, so pandas releases that compile it can turn every
    # transcript into NaN. Only a genuinely empty transcript becomes NaN here.
    df_transcribe.replace('', np.nan, inplace=True)
    df_transcribe[r_config.err_reason] = 'Pass'  # will replace with threshold in future release
    df_transcribe['dbm_master_url'] = video_uri

    logger.info('Saving Output file {} '.format(out_loc))
    ut.save_output(df_transcribe, out_loc, fl_name, formant_dir, csv_ext)


def empty_transcribe(video_uri, out_loc, fl_name, r_config):
    """
    Saving a placeholder transcription row when the audio cannot be used.

    The transcript column holds NaN and the error column holds `error_txt`.

    Args:
        video_uri: input path, stored in the 'dbm_master_url' column
        out_loc: (str) Output directory; fl_name: input file name
        r_config: raw variable config
    """
    cols = [r_config.nlp_transcribe, r_config.err_reason]
    df_empty = pd.DataFrame([[np.nan, error_txt]], columns=cols)
    df_empty['dbm_master_url'] = video_uri

    logger.info('Saving Output file {} '.format(out_loc))
    ut.save_output(df_empty, out_loc, fl_name, formant_dir, csv_ext)


def run_transcribe(video_uri, out_dir, r_config, deep_path):
    """
    Transcribing the .wav file that belongs to video_uri.

    The first '<fl_name>.wav' found in the input location is used. Audio
    shorter than 0.1 sec gets a placeholder row instead of a transcript.
    Any failure is logged and swallowed so the caller can carry on with
    its remaining work.

    Args:
        video_uri: video path; r_config: raw variable config object
        out_dir: (str) Output directory for processed output; deep_path: deepspeech build path
    """
    try:
        input_loc, out_loc, fl_name = ut.filter_path(video_uri, out_dir)
        aud_filter = glob.glob(join(input_loc, fl_name + '.wav'))
        if not aud_filter:
            logger.info('No audio file found for {}'.format(video_uri))
            return

        audio_file = aud_filter[0]
        aud_dur = librosa.get_duration(filename=audio_file)

        if float(aud_dur) < 0.1:
            logger.info('Output file {} size is less than 0.1 sec'.format(audio_file))
            empty_transcribe(video_uri, out_loc, fl_name, r_config)
            return

        calc_transcribe(video_uri, audio_file, out_loc, fl_name, r_config, deep_path)
    except Exception:
        # Best effort per file, but keep the traceback so failures can be diagnosed.
        logger.exception('Failed to process audio file')


# ---- dbm_lib/dbm_features/raw_features/util/nlp_util.py ----

#Speech to text using Deepspeech 0.9.1
def deepspeech(AUDIO_FILE, deep_path):
    """
    Extracting text from audio by running the `deepspeech` command line client.

    The model (deepspeech-0.9.1-models.pbmm) and scorer
    (deepspeech-0.9.1-models.scorer) are read from deep_path.

    Args:
        AUDIO_FILE: audio file path passed to --audio
        deep_path: directory holding the model and scorer files
    Returns:
        (stdout, stderr) as returned by Popen.communicate(). stderr is
        redirected into stdout, so the second item is None.
        ("error", "error") when reading the client output fails.
    Raises:
        OSError: propagated from subprocess.Popen when the client cannot be started
    """
    command = [
        'deepspeech',
        '--model', os.path.join(deep_path, 'deepspeech-0.9.1-models.pbmm'),
        '--scorer', os.path.join(deep_path, 'deepspeech-0.9.1-models.scorer'),
        '--audio', AUDIO_FILE,
    ]

    with subprocess.Popen(command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT) as process:
        try:
            stdout, stderr = process.communicate()
        except Exception:
            # Do not leave a half-read child process behind.
            process.kill()
            logger.exception('Failed to read deepspeech output')
            return "error", "error"

    if process.returncode != 0:
        logger.warning('deepspeech exited with code {}'.format(process.returncode))
    logger.info('Deepspeech output...... {}'.format(stdout))
    return stdout, stderr


def deep_speech_output_clean(result):
    """
    Parsing deep speech output(text)

    The transcript is taken to be the line that follows the first line
    containing 'Inference took'.

    Args:
        result: sequence whose first item is the captured client output
                (bytes or str), e.g. the tuple returned by deepspeech()
    Return:
        Text from speech; empty string when no such line exists
    """
    if not result:
        return ""

    raw_output = result[0]
    if isinstance(raw_output, bytes):
        # Decode instead of parsing repr(bytes): keeps non-ASCII text intact
        # and avoids stray quote characters in the transcript.
        raw_output = raw_output.decode('utf-8', errors='replace')
    lines = str(raw_output).splitlines()

    for idx, line in enumerate(lines):
        if 'Inference took' in line:
            # The marker may be the very last line of the output.
            return lines[idx + 1] if idx + 1 < len(lines) else ""
    return ""


def process_deepspeech(audio_file, deep_path):
    """
    Transcribing audio to extract text from speech

    Args:
        audio_file: audio file path; deep_path: deepspeech build path
    Return:
        Transcript text, or an empty string when none could be parsed
    """
    deep_output = deepspeech(audio_file, deep_path)
    return deep_speech_output_clean(deep_output)
os.path.join(args.output_path, 'raw_variables') pf.process_acoustic(audio, out_path, args.dbm_group, r_config) + pf.process_nlp(audio, out_path, args.dbm_group, r_config, DEEP_SPEECH) + except Exception as e: logger.error('Failed to process wav file.') diff --git a/process_dbm.sh b/process_dbm.sh index cad71b7f..361d2aed 100644 --- a/process_dbm.sh +++ b/process_dbm.sh @@ -55,6 +55,9 @@ fi if [[ $dbm_group == *"movement"* ]]; then dbm_new="$dbm_new movement" fi +if [[ $dbm_group == *"nlp"* ]]; then + dbm_new="$dbm_new nlp" +fi #docker commands to run container docker create -ti --name dbm_container dbm bash diff --git a/requirements.txt b/requirements.txt index 01f03057..1ecea7d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,4 @@ more_itertools scipy==1.2.0 pyyaml pydub +deepspeech \ No newline at end of file diff --git a/resources/features/raw_feature.yml b/resources/features/raw_feature.yml index 982bf631..f8b00883 100644 --- a/resources/features/raw_feature.yml +++ b/resources/features/raw_feature.yml @@ -196,3 +196,6 @@ raw_feature: mov_Hpose_Yaw: mov_hposeyaw mov_Hpose_Roll: mov_hposeroll mov_Hpose_Dist: mov_hposedist + + #NLP markers + nlp_transcribe: nlp_transcribe