code refactoring

This commit is contained in:
jordi.hasianta
2022-09-15 20:27:53 +07:00
parent 6a609550f2
commit c6cd54c376
2 changed files with 87 additions and 72 deletions

View File

@@ -4,7 +4,4 @@ project_name: DBM
created: 2020-20-07
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function

View File

@@ -4,16 +4,18 @@ project_name: DBM
created: 2020-20-07
"""
import pandas as pd
import numpy as np
import glob
import os
import logging
import os
from datetime import datetime
import numpy as np
import pandas as pd
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def dict_to_df(feature_dict, file):
"""
Converting dictionary to dataframe
@@ -21,10 +23,11 @@ def dict_to_df(feature_dict, file):
final_dict = {k: v for d in feature_dict for k, v in d.items()}
feature_df = pd.DataFrame([final_dict])
feature_df['Filename'] = file
feature_df["Filename"] = file
return feature_df
def save_derive_output(df_list, out_loc):
"""
Saving derive variable output
@@ -33,14 +36,16 @@ def save_derive_output(df_list, out_loc):
if len(df_list) > 0:
df = df_list[0]
file_name = os.path.join(out_loc, 'derived_output.csv')
file_name = os.path.join(out_loc, "derived_output.csv")
if not os.path.exists(out_loc):
os.makedirs(out_loc)
df.to_csv(file_name, index=False)
except Exception as e:
logger.error('Failed to save derived variable csv')
e
logger.error("Failed to save derived variable csv")
def feature_output(df_fea, exp_var, cal_type):
"""
@@ -54,36 +59,37 @@ def feature_output(df_fea, exp_var, cal_type):
if len(df_) > 0:
if cal_type == 'mean':
if cal_type == "mean":
exp_val = df_.mean(axis=0, skipna=True)
elif cal_type == 'std':
elif cal_type == "std":
exp_val = df_.std(axis=0, skipna=True)
elif cal_type == 'count':#use case for eye blink
exp_var = 'mov_blink'
elif cal_type == "count": # use case for eye blink
exp_var = "mov_blink"
exp_val = (len(df_) / df_[0]) * 60
elif cal_type == 'pct':
elif cal_type == "pct":
if len(df_) > 0:
exp_val = len(df_[df_ > 0]) / len(df_)
elif cal_type == 'range':
elif cal_type == "range":
exp_val = max(df_) - min(df_)
except Exception as e:
logger.error('Failed to compute calculation: {}'.format(e))
logger.error("Failed to compute calculation: {}".format(e))
pass
var_name = exp_var + '_' + cal_type
var_name = exp_var + "_" + cal_type
exp_val = float("{0:.4f}".format(exp_val))
var_val = (var_name, exp_val)
return var_val
def cal_type_dict(var_df, raw_df, d_cfg_Obj, r_cfg_Obj):
var_name = str(var_df['var_id'])
var_name = str(var_df["var_id"])
# fetching key based on variable name from raw config
var_key = list(r_cfg_Obj.keys())[list(r_cfg_Obj.values()).index(var_name)]
@@ -94,19 +100,29 @@ def cal_type_dict(var_df, raw_df, d_cfg_Obj, r_cfg_Obj):
return var_val_dict
def compute_feature(raw_df, var_cols, d_cfg_Obj, r_cfg_Obj):
"""
Computing features
"""
# Variable data frame for each feature group
var_df = pd.DataFrame(var_cols,columns=['var_id'])
var_df = pd.DataFrame(var_cols, columns=["var_id"])
feature_dict = {}
if len(raw_df) > 0:
feature_dict = var_df.apply(cal_type_dict, args=(raw_df, d_cfg_Obj, r_cfg_Obj, ), axis=1)
feature_dict = var_df.apply(
cal_type_dict,
args=(
raw_df,
d_cfg_Obj,
r_cfg_Obj,
),
axis=1,
)
return feature_dict
def calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature):
"""
Calculating derived variable
@@ -120,8 +136,8 @@ def calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature):
var_cols = [r_cfg_Obj[x] for x in d_cfg_Obj[feature]]
fea_loc = d_cfg_Obj[feature + '_LOC']
fea_res = glob.glob(os.path.join(input_loc, '*/*/*' + fea_loc + '.csv'))
fea_loc = d_cfg_Obj[feature + "_LOC"]
fea_res = glob.glob(os.path.join(input_loc, "*/*/*" + fea_loc + ".csv"))
if len(fea_res) > 0:
raw_df = pd.read_csv(fea_res[0])
@@ -135,13 +151,14 @@ def calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature):
df = pd.concat(df_list, ignore_index=True)
return df
def run_derive(input_file, input_dir, output_dir, r_config, d_config):
"""
Processing derived variable
"""
d_cfg_Obj = d_config.base_derive['derive_feature']
r_cfg_Obj = r_config.base_raw['raw_feature']
feature_group = d_cfg_Obj['FEATURE_GROUP']
d_cfg_Obj = d_config.base_derive["derive_feature"]
r_cfg_Obj = r_config.base_raw["raw_feature"]
feature_group = d_cfg_Obj["FEATURE_GROUP"]
# Iterating over feature group
df_list = []
@@ -154,11 +171,12 @@ def run_derive(input_file, input_dir, output_dir, r_config, d_config):
if len(df_list) == 0:
df_list.append(df_fea)
else:
result = pd.merge(df_list[0], df_fea, how='outer', on=['Filename'])
result = pd.merge(df_list[0], df_fea, how="outer", on=["Filename"])
df_list = [result]
except Exception as e:
logger.error('Failed to process derived variables {}'.format(feature))
e
logger.error("Failed to process derived variables {}".format(feature))
logger.info("Saving derived variable output...")
save_derive_output(df_list, output_dir)