diff --git a/opendbm/dbm_lib/dbm_features/derived_features/__init__.py b/opendbm/dbm_lib/dbm_features/derived_features/__init__.py index 4a214911..5b391f0f 100644 --- a/opendbm/dbm_lib/dbm_features/derived_features/__init__.py +++ b/opendbm/dbm_lib/dbm_features/derived_features/__init__.py @@ -4,7 +4,4 @@ project_name: DBM created: 2020-20-07 """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - +from __future__ import absolute_import, division, print_function diff --git a/opendbm/dbm_lib/dbm_features/derived_features/derive.py b/opendbm/dbm_lib/dbm_features/derived_features/derive.py index 55f699c1..597532ba 100644 --- a/opendbm/dbm_lib/dbm_features/derived_features/derive.py +++ b/opendbm/dbm_lib/dbm_features/derived_features/derive.py @@ -4,15 +4,17 @@ project_name: DBM created: 2020-20-07 """ -import pandas as pd -import numpy as np import glob -import os import logging +import os from datetime import datetime +import numpy as np +import pandas as pd + logging.basicConfig(level=logging.INFO) -logger=logging.getLogger() +logger = logging.getLogger() + def dict_to_df(feature_dict, file): """ @@ -21,92 +23,106 @@ def dict_to_df(feature_dict, file): final_dict = {k: v for d in feature_dict for k, v in d.items()} feature_df = pd.DataFrame([final_dict]) - feature_df['Filename'] = file - + feature_df["Filename"] = file + return feature_df + def save_derive_output(df_list, out_loc): """ Saving derive variable output """ try: - if len(df_list)>0: + if len(df_list) > 0: df = df_list[0] - - file_name = os.path.join(out_loc, 'derived_output.csv') + + file_name = os.path.join(out_loc, "derived_output.csv") if not os.path.exists(out_loc): - + os.makedirs(out_loc) df.to_csv(file_name, index=False) - + except Exception as e: - logger.error('Failed to save derived variable csv') + e + logger.error("Failed to save derived variable csv") + def feature_output(df_fea, exp_var, cal_type): """ - Computing mean value of dataframe columns + Computing mean value of dataframe columns """ exp_val = np.nan try: - + df_ = df_fea[exp_var].astype(float).copy() df_ = df_.dropna().reset_index(drop=True) - - if len(df_)>0: - if cal_type == 'mean': - exp_val = df_.mean(axis = 0, skipna = True) + if len(df_) > 0: - elif cal_type == 'std': - exp_val = df_.std(axis = 0, skipna = True) + if cal_type == "mean": + exp_val = df_.mean(axis=0, skipna=True) - elif cal_type == 'count':#use case for eye blink - exp_var = 'mov_blink' - exp_val = (len(df_)/df_[0])*60 + elif cal_type == "std": + exp_val = df_.std(axis=0, skipna=True) - elif cal_type == 'pct': - if len(df_)>0: - exp_val = len(df_[df_ > 0])/len(df_) + elif cal_type == "count": # use case for eye blink + exp_var = "mov_blink" + exp_val = (len(df_) / df_[0]) * 60 - elif cal_type == 'range': + elif cal_type == "pct": + if len(df_) > 0: + exp_val = len(df_[df_ > 0]) / len(df_) + + elif cal_type == "range": exp_val = max(df_) - min(df_) except Exception as e: - logger.error('Failed to compute calculation: {}'.format(e)) + logger.error("Failed to compute calculation: {}".format(e)) pass - - var_name = exp_var + '_' + cal_type + + var_name = exp_var + "_" + cal_type exp_val = float("{0:.4f}".format(exp_val)) var_val = (var_name, exp_val) - + return var_val + def cal_type_dict(var_df, raw_df, d_cfg_Obj, r_cfg_Obj): - - var_name = str(var_df['var_id']) - - #fetching key based on variable name from raw config + + var_name = str(var_df["var_id"]) + + # fetching key based on variable name from raw config var_key = list(r_cfg_Obj.keys())[list(r_cfg_Obj.values()).index(var_name)] - cal_type = d_cfg_Obj[var_key] # calculation type from config - + cal_type = d_cfg_Obj[var_key] # calculation type from config + var_val = [feature_output(raw_df, var_name, cal) for cal in cal_type] var_val_dict = dict(var_val) - + return var_val_dict + def compute_feature(raw_df, var_cols, d_cfg_Obj, r_cfg_Obj): """ Computing features """ - #Variable data frame for each feature group - var_df = pd.DataFrame(var_cols,columns=['var_id']) + # Variable data frame for each feature group + var_df = pd.DataFrame(var_cols, columns=["var_id"]) feature_dict = {} - - if len(raw_df)>0: - feature_dict = var_df.apply(cal_type_dict, args=(raw_df, d_cfg_Obj, r_cfg_Obj, ), axis=1) + + if len(raw_df) > 0: + feature_dict = var_df.apply( + cal_type_dict, + args=( + raw_df, + d_cfg_Obj, + r_cfg_Obj, + ), + axis=1, + ) return feature_dict - + + def calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature): """ Calculating derived variable @@ -114,51 +130,53 @@ def calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature): df_list = [] df = pd.DataFrame() for file in input_file: - + file_name, _ = os.path.splitext(os.path.basename(file)) input_loc = os.path.join(input_dir, file_name) - + var_cols = [r_cfg_Obj[x] for x in d_cfg_Obj[feature]] - - fea_loc = d_cfg_Obj[feature + '_LOC'] - fea_res = glob.glob(os.path.join(input_loc, '*/*/*' + fea_loc + '.csv')) - - if len(fea_res)>0: + + fea_loc = d_cfg_Obj[feature + "_LOC"] + fea_res = glob.glob(os.path.join(input_loc, "*/*/*" + fea_loc + ".csv")) + + if len(fea_res) > 0: raw_df = pd.read_csv(fea_res[0]) feature_dict = compute_feature(raw_df, var_cols, d_cfg_Obj, r_cfg_Obj) - - if len(feature_dict)>0: + + if len(feature_dict) > 0: feature_df = dict_to_df(feature_dict, file) df_list.append(feature_df) - - if len(df_list)>0: + + if len(df_list) > 0: df = pd.concat(df_list, ignore_index=True) return df + def run_derive(input_file, input_dir, output_dir, r_config, d_config): """ Processing derived variable """ - d_cfg_Obj = d_config.base_derive['derive_feature'] - r_cfg_Obj = r_config.base_raw['raw_feature'] - feature_group = d_cfg_Obj['FEATURE_GROUP'] - - #Iterating over feature group + d_cfg_Obj = d_config.base_derive["derive_feature"] + r_cfg_Obj = r_config.base_raw["raw_feature"] + feature_group = d_cfg_Obj["FEATURE_GROUP"] + + # Iterating over feature group df_list = [] for feature in feature_group: try: - + df_fea = calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature) - if len(df_fea)>0: - + if len(df_fea) > 0: + if len(df_list) == 0: df_list.append(df_fea) else: - result = pd.merge(df_list[0], df_fea, how='outer', on=['Filename']) + result = pd.merge(df_list[0], df_fea, how="outer", on=["Filename"]) df_list = [result] - + except Exception as e: - logger.error('Failed to process derived variables {}'.format(feature)) - + e + logger.error("Failed to process derived variables {}".format(feature)) + logger.info("Saving derived variable output...") - save_derive_output(df_list, output_dir) \ No newline at end of file + save_derive_output(df_list, output_dir)