code refactoring

This commit is contained in:
jordi.hasianta
2022-09-15 20:27:53 +07:00
parent 6a609550f2
commit c6cd54c376
2 changed files with 87 additions and 72 deletions

View File

@@ -4,7 +4,4 @@ project_name: DBM
created: 2020-20-07 created: 2020-20-07
""" """
from __future__ import absolute_import from __future__ import absolute_import, division, print_function
from __future__ import division
from __future__ import print_function

View File

@@ -4,15 +4,17 @@ project_name: DBM
created: 2020-20-07 created: 2020-20-07
""" """
import pandas as pd
import numpy as np
import glob import glob
import os
import logging import logging
import os
from datetime import datetime from datetime import datetime
import numpy as np
import pandas as pd
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger=logging.getLogger() logger = logging.getLogger()
def dict_to_df(feature_dict, file):
    """
    Flatten an iterable of dicts into a one-row dataframe tagged with a file.

    Args:
        feature_dict: iterable of dicts; their items are merged into a single
            dict, with later entries overwriting earlier ones on key clashes.
        file: value written to the "Filename" column of the resulting row.

    Returns:
        A single-row pandas DataFrame with one column per merged key and a
        "Filename" column.
    """
    merged = {}
    for partial in feature_dict:
        for key, value in partial.items():
            merged[key] = value

    row_df = pd.DataFrame([merged])
    row_df["Filename"] = file
    return row_df
def save_derive_output(df_list, out_loc):
    """
    Saving derive variable output

    Writes the first dataframe of ``df_list`` to
    ``<out_loc>/derived_output.csv`` (without the index column). Nothing is
    written when ``df_list`` is empty.

    Args:
        df_list: list whose first element is the dataframe to persist.
        out_loc: output directory; created (including parents) if missing.

    Returns:
        None. Any failure is logged together with its cause and is not
        re-raised, so saving stays best-effort for the caller.
    """
    try:
        if len(df_list) > 0:
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(out_loc, exist_ok=True)
            file_name = os.path.join(out_loc, "derived_output.csv")
            df_list[0].to_csv(file_name, index=False)
    except Exception as e:
        logger.error("Failed to save derived variable csv: {}".format(e))
def feature_output(df_fea, exp_var, cal_type):
    """
    Compute one summary statistic for a single dataframe column.

    The column is cast to float and NaN rows are dropped before computing.

    Args:
        df_fea: dataframe holding the raw feature values.
        exp_var: name of the column to summarise.
        cal_type: statistic to compute:
            "mean"  - arithmetic mean
            "std"   - standard deviation (pandas default)
            "count" - (number of values / first value) * 60; the result is
                      reported under the fixed prefix "mov_blink"
            "pct"   - fraction of values greater than zero
            "range" - maximum minus minimum
            Any other value leaves the result as NaN and logs a warning.

    Returns:
        Tuple ``(name, value)`` where ``name`` is ``<prefix>_<cal_type>`` and
        ``value`` is a float formatted to four decimals. ``value`` is NaN when
        the column has no non-NaN rows or when the computation fails (the
        failure is logged, not raised).
    """
    exp_val = np.nan
    try:
        values = df_fea[exp_var].astype(float).dropna().reset_index(drop=True)
        n_values = len(values)

        if n_values > 0:
            if cal_type == "mean":
                exp_val = values.mean(axis=0, skipna=True)
            elif cal_type == "std":
                exp_val = values.std(axis=0, skipna=True)
            elif cal_type == "count":  # use case for eye blink
                exp_var = "mov_blink"
                # NOTE(review): presumably the first value is a duration in
                # seconds, making this a per-minute rate - confirm.
                exp_val = (n_values / values[0]) * 60
            elif cal_type == "pct":
                exp_val = (values > 0).sum() / n_values
            elif cal_type == "range":
                # Series.max/min instead of the builtins, which iterate the
                # Series element by element in Python.
                exp_val = values.max() - values.min()
            else:
                logger.warning("Unknown calculation type: {}".format(cal_type))
    except Exception as e:
        logger.error("Failed to compute calculation: {}".format(e))

    var_name = exp_var + "_" + cal_type
    exp_val = float("{0:.4f}".format(exp_val))
    return (var_name, exp_val)
def cal_type_dict(var_df, raw_df, d_cfg_Obj, r_cfg_Obj):
    """
    Compute every configured calculation for one variable.

    Args:
        var_df: mapping-like row exposing the variable name under "var_id".
        raw_df: dataframe of raw values handed to ``feature_output``.
        d_cfg_Obj: derive config; indexed by the raw-config key to obtain the
            iterable of calculation types for the variable.
        r_cfg_Obj: raw config mapping keys to variable names.

    Returns:
        Dict mapping each name returned by ``feature_output`` to its value.

    Raises:
        ValueError: if the variable name is not among the raw config values.
    """
    var_name = str(var_df["var_id"])

    # Reverse lookup: locate the raw-config key whose value is this variable.
    raw_keys = list(r_cfg_Obj.keys())
    raw_values = list(r_cfg_Obj.values())
    var_key = raw_keys[raw_values.index(var_name)]

    computed = []
    for cal in d_cfg_Obj[var_key]:  # calculation types from config
        computed.append(feature_output(raw_df, var_name, cal))
    return dict(computed)
def compute_feature(raw_df, var_cols, d_cfg_Obj, r_cfg_Obj):
    """
    Computing features

    Applies ``cal_type_dict`` to every variable name in ``var_cols``.

    Args:
        raw_df: dataframe of raw values; when it has no rows nothing is
            computed.
        var_cols: variable names, placed in a one-column ("var_id") frame.
        d_cfg_Obj: derive config forwarded to ``cal_type_dict``.
        r_cfg_Obj: raw config forwarded to ``cal_type_dict``.

    Returns:
        An empty dict when ``raw_df`` is empty, otherwise the result of the
        row-wise ``DataFrame.apply`` over the variable frame.
    """
    # Variable data frame for each feature group
    var_df = pd.DataFrame(var_cols, columns=["var_id"])

    if len(raw_df) == 0:
        return {}

    extra_args = (raw_df, d_cfg_Obj, r_cfg_Obj)
    return var_df.apply(cal_type_dict, args=extra_args, axis=1)
def calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature):
    """
    Calculating derived variable

    For each input file, looks for a raw csv under
    ``<input_dir>/<file stem>/*/*/*<LOC>.csv`` (``LOC`` being
    ``d_cfg_Obj[feature + "_LOC"]``), computes the features of the first
    match and collects one row per file.

    Args:
        input_file: iterable of input file paths.
        input_dir: directory containing one sub-directory per file stem.
        r_cfg_Obj: raw config mapping keys to variable names.
        d_cfg_Obj: derive config; ``d_cfg_Obj[feature]`` lists the raw-config
            keys belonging to the feature group.
        feature: name of the feature group to process.

    Returns:
        DataFrame with the concatenated per-file rows, or an empty DataFrame
        when no file produced any feature.
    """
    collected = []

    for file in input_file:
        stem = os.path.splitext(os.path.basename(file))[0]
        input_loc = os.path.join(input_dir, stem)

        var_cols = [r_cfg_Obj[x] for x in d_cfg_Obj[feature]]
        fea_loc = d_cfg_Obj[feature + "_LOC"]
        matches = glob.glob(os.path.join(input_loc, "*/*/*" + fea_loc + ".csv"))
        if len(matches) == 0:
            continue

        # Only the first matching csv is used.
        raw_df = pd.read_csv(matches[0])
        feature_dict = compute_feature(raw_df, var_cols, d_cfg_Obj, r_cfg_Obj)
        if len(feature_dict) > 0:
            collected.append(dict_to_df(feature_dict, file))

    if len(collected) > 0:
        return pd.concat(collected, ignore_index=True)
    return pd.DataFrame()
def run_derive(input_file, input_dir, output_dir, r_config, d_config):
    """
    Processing derived variable

    Computes the derived variables of every configured feature group,
    outer-merges the per-group results on "Filename" and saves the merged
    frame through ``save_derive_output``.

    Args:
        input_file: iterable of input file paths.
        input_dir: directory holding the raw outputs, passed to
            ``calc_derive``.
        output_dir: directory the merged result is saved to.
        r_config: object whose ``base_raw["raw_feature"]`` is the raw config.
        d_config: object whose ``base_derive["derive_feature"]`` is the
            derive config; its "FEATURE_GROUP" entry lists the groups.

    Returns:
        None. A failure in one feature group is logged with its cause and the
        remaining groups are still processed.
    """
    d_cfg_Obj = d_config.base_derive["derive_feature"]
    r_cfg_Obj = r_config.base_raw["raw_feature"]
    feature_group = d_cfg_Obj["FEATURE_GROUP"]

    # Iterating over feature group
    df_list = []
    for feature in feature_group:
        try:
            df_fea = calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature)
            if len(df_fea) > 0:
                if len(df_list) == 0:
                    df_list.append(df_fea)
                else:
                    # Keep a single running frame: merge the new group in.
                    result = pd.merge(df_list[0], df_fea, how="outer", on=["Filename"])
                    df_list = [result]
        except Exception as e:
            # Include the cause; the message alone made failures undiagnosable.
            logger.error(
                "Failed to process derived variables {}: {}".format(feature, e)
            )

    logger.info("Saving derived variable output...")
    save_derive_output(df_list, output_dir)