code refactoring
This commit is contained in:
@@ -4,7 +4,4 @@ project_name: DBM
|
|||||||
created: 2020-20-07
|
created: 2020-20-07
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import, division, print_function
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,15 +4,17 @@ project_name: DBM
|
|||||||
created: 2020-20-07
|
created: 2020-20-07
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import glob
|
import glob
|
||||||
import os
|
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger=logging.getLogger()
|
logger = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
def dict_to_df(feature_dict, file):
|
def dict_to_df(feature_dict, file):
|
||||||
"""
|
"""
|
||||||
@@ -21,30 +23,33 @@ def dict_to_df(feature_dict, file):
|
|||||||
final_dict = {k: v for d in feature_dict for k, v in d.items()}
|
final_dict = {k: v for d in feature_dict for k, v in d.items()}
|
||||||
|
|
||||||
feature_df = pd.DataFrame([final_dict])
|
feature_df = pd.DataFrame([final_dict])
|
||||||
feature_df['Filename'] = file
|
feature_df["Filename"] = file
|
||||||
|
|
||||||
return feature_df
|
return feature_df
|
||||||
|
|
||||||
|
|
||||||
def save_derive_output(df_list, out_loc):
    """
    Saving derive variable output

    Writes the first data frame in ``df_list`` to
    ``<out_loc>/derived_output.csv`` (without the index column), creating
    ``out_loc`` first when it does not exist yet. Nothing is written when
    ``df_list`` is empty.

    Args:
        df_list: list of data frames; only the first element is saved
        out_loc: directory that receives ``derived_output.csv``

    Returns:
        None. Saving is best-effort: any failure is logged and swallowed
        so the caller is never interrupted.
    """
    try:
        # The length check stays inside the try block so that a malformed
        # df_list is logged like any other failure instead of raising.
        if len(df_list) == 0:
            return

        if not os.path.exists(out_loc):
            os.makedirs(out_loc)

        file_name = os.path.join(out_loc, "derived_output.csv")
        df_list[0].to_csv(file_name, index=False)

    except Exception:
        # logger.exception keeps the original message and appends the
        # traceback, so the cause of the failure is no longer discarded.
        logger.exception("Failed to save derived variable csv")
|
|
||||||
|
|
||||||
def feature_output(df_fea, exp_var, cal_type):
|
def feature_output(df_fea, exp_var, cal_type):
|
||||||
"""
|
"""
|
||||||
Computing mean value of dataframe columns
|
Computing mean value of dataframe columns
|
||||||
"""
|
"""
|
||||||
exp_val = np.nan
|
exp_val = np.nan
|
||||||
try:
|
try:
|
||||||
@@ -52,61 +57,72 @@ def feature_output(df_fea, exp_var, cal_type):
|
|||||||
df_ = df_fea[exp_var].astype(float).copy()
|
df_ = df_fea[exp_var].astype(float).copy()
|
||||||
df_ = df_.dropna().reset_index(drop=True)
|
df_ = df_.dropna().reset_index(drop=True)
|
||||||
|
|
||||||
if len(df_)>0:
|
if len(df_) > 0:
|
||||||
|
|
||||||
if cal_type == 'mean':
|
if cal_type == "mean":
|
||||||
exp_val = df_.mean(axis = 0, skipna = True)
|
exp_val = df_.mean(axis=0, skipna=True)
|
||||||
|
|
||||||
elif cal_type == 'std':
|
elif cal_type == "std":
|
||||||
exp_val = df_.std(axis = 0, skipna = True)
|
exp_val = df_.std(axis=0, skipna=True)
|
||||||
|
|
||||||
elif cal_type == 'count':#use case for eye blink
|
elif cal_type == "count": # use case for eye blink
|
||||||
exp_var = 'mov_blink'
|
exp_var = "mov_blink"
|
||||||
exp_val = (len(df_)/df_[0])*60
|
exp_val = (len(df_) / df_[0]) * 60
|
||||||
|
|
||||||
elif cal_type == 'pct':
|
elif cal_type == "pct":
|
||||||
if len(df_)>0:
|
if len(df_) > 0:
|
||||||
exp_val = len(df_[df_ > 0])/len(df_)
|
exp_val = len(df_[df_ > 0]) / len(df_)
|
||||||
|
|
||||||
elif cal_type == 'range':
|
elif cal_type == "range":
|
||||||
exp_val = max(df_) - min(df_)
|
exp_val = max(df_) - min(df_)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error('Failed to compute calculation: {}'.format(e))
|
logger.error("Failed to compute calculation: {}".format(e))
|
||||||
pass
|
pass
|
||||||
|
|
||||||
var_name = exp_var + '_' + cal_type
|
var_name = exp_var + "_" + cal_type
|
||||||
exp_val = float("{0:.4f}".format(exp_val))
|
exp_val = float("{0:.4f}".format(exp_val))
|
||||||
var_val = (var_name, exp_val)
|
var_val = (var_name, exp_val)
|
||||||
|
|
||||||
return var_val
|
return var_val
|
||||||
|
|
||||||
|
|
||||||
def cal_type_dict(var_df, raw_df, d_cfg_Obj, r_cfg_Obj):
    """
    Computing every configured calculation for one raw variable

    Args:
        var_df: row-like object; only ``var_df["var_id"]`` is read
        raw_df: raw feature data, passed through to ``feature_output``
        d_cfg_Obj: derive config; maps a config key to the calculation
            types to run for that variable
        r_cfg_Obj: raw config; maps a config key to a variable name

    Returns:
        dict built from the ``(name, value)`` pairs returned by
        ``feature_output``, one pair per calculation type.

    Raises:
        ValueError: the variable name is not a value of ``r_cfg_Obj``
        KeyError: the matching key is missing from ``d_cfg_Obj``
    """
    var_name = str(var_df["var_id"])

    # fetching key based on variable name from raw config; the first key
    # whose value equals the variable name wins
    for key, value in r_cfg_Obj.items():
        if value == var_name:
            var_key = key
            break
    else:
        raise ValueError(
            "Variable {!r} not found in raw feature config".format(var_name)
        )

    cal_type = d_cfg_Obj[var_key]  # calculation type from config

    return dict(feature_output(raw_df, var_name, cal) for cal in cal_type)
|
|
||||||
|
|
||||||
def compute_feature(raw_df, var_cols, d_cfg_Obj, r_cfg_Obj):
    """
    Computing features

    Args:
        raw_df: raw feature data; nothing is computed when it is empty
        var_cols: variable names to derive values for
        d_cfg_Obj: derive config, forwarded to ``cal_type_dict``
        r_cfg_Obj: raw config, forwarded to ``cal_type_dict``

    Returns:
        An empty dict when ``raw_df`` has no rows; otherwise the result of
        applying ``cal_type_dict`` row-wise to the variable data frame
        (one result per entry of ``var_cols``).
    """
    if len(raw_df) == 0:
        return {}

    # Variable data frame for each feature group
    var_df = pd.DataFrame(var_cols, columns=["var_id"])

    return var_df.apply(
        cal_type_dict,
        args=(raw_df, d_cfg_Obj, r_cfg_Obj),
        axis=1,
    )
|
|
||||||
|
|
||||||
def calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature):
|
def calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature):
|
||||||
"""
|
"""
|
||||||
Calculating derived variable
|
Calculating derived variable
|
||||||
@@ -120,45 +136,47 @@ def calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature):
|
|||||||
|
|
||||||
var_cols = [r_cfg_Obj[x] for x in d_cfg_Obj[feature]]
|
var_cols = [r_cfg_Obj[x] for x in d_cfg_Obj[feature]]
|
||||||
|
|
||||||
fea_loc = d_cfg_Obj[feature + '_LOC']
|
fea_loc = d_cfg_Obj[feature + "_LOC"]
|
||||||
fea_res = glob.glob(os.path.join(input_loc, '*/*/*' + fea_loc + '.csv'))
|
fea_res = glob.glob(os.path.join(input_loc, "*/*/*" + fea_loc + ".csv"))
|
||||||
|
|
||||||
if len(fea_res)>0:
|
if len(fea_res) > 0:
|
||||||
raw_df = pd.read_csv(fea_res[0])
|
raw_df = pd.read_csv(fea_res[0])
|
||||||
feature_dict = compute_feature(raw_df, var_cols, d_cfg_Obj, r_cfg_Obj)
|
feature_dict = compute_feature(raw_df, var_cols, d_cfg_Obj, r_cfg_Obj)
|
||||||
|
|
||||||
if len(feature_dict)>0:
|
if len(feature_dict) > 0:
|
||||||
feature_df = dict_to_df(feature_dict, file)
|
feature_df = dict_to_df(feature_dict, file)
|
||||||
df_list.append(feature_df)
|
df_list.append(feature_df)
|
||||||
|
|
||||||
if len(df_list)>0:
|
if len(df_list) > 0:
|
||||||
df = pd.concat(df_list, ignore_index=True)
|
df = pd.concat(df_list, ignore_index=True)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def run_derive(input_file, input_dir, output_dir, r_config, d_config):
    """
    Processing derived variable

    Runs ``calc_derive`` for every feature group listed under
    ``FEATURE_GROUP`` in the derive config, outer-merges the per-group
    results on ``Filename`` and hands the merged frame to
    ``save_derive_output``.

    Args:
        input_file: forwarded unchanged to ``calc_derive``
        input_dir: forwarded unchanged to ``calc_derive``
        output_dir: directory passed to ``save_derive_output``
        r_config: object whose ``base_raw["raw_feature"]`` is the raw config
        d_config: object whose ``base_derive["derive_feature"]`` is the
            derive config

    Returns:
        None. A feature group that fails is logged and skipped so the
        remaining groups are still processed.
    """
    d_cfg_Obj = d_config.base_derive["derive_feature"]
    r_cfg_Obj = r_config.base_raw["raw_feature"]
    feature_group = d_cfg_Obj["FEATURE_GROUP"]

    # Iterating over feature group; df_list holds at most one frame, the
    # running merge of every group processed so far
    df_list = []
    for feature in feature_group:
        try:
            df_fea = calc_derive(input_file, input_dir, r_cfg_Obj, d_cfg_Obj, feature)
            if len(df_fea) == 0:
                continue

            if len(df_list) == 0:
                df_list.append(df_fea)
            else:
                df_list = [
                    pd.merge(df_list[0], df_fea, how="outer", on=["Filename"])
                ]

        except Exception:
            # Same message text as before; logger.exception additionally
            # records the traceback that was previously thrown away.
            logger.exception("Failed to process derived variables %s", feature)

    logger.info("Saving derived variable output...")
    save_derive_output(df_list, output_dir)
Reference in New Issue
Block a user