# -*- coding: utf-8 -*-
"""
Created on Mon Mar 26 20:16:29 2018

@author: linda
"""
import lightgbm as lgb
import pandas as pd
import numpy as np
import xlsxwriter
from lightGBM_function import lightGBM
## importance score assignment
# Location of the sample-by-strain matrix. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences,
# and the name avoids shadowing the builtin ``input``.
input_path = r'F:\Project\Project9-DiseasesDiscrimination\step3WGS-01\Sample-Selectedstrain-matrix.csv'

data = pd.read_csv(input_path)

# The first column is skipped and the last column is used as the target;
# everything in between is treated as a feature.
feature_columns = data.columns[1:-1]
label_column = data.columns[-1]
X = data[feature_columns]
y = data[label_column]

# Wrap the full feature matrix and target in a LightGBM dataset; no
# validation set is built here (the valid_sets argument below is disabled).
lgb_train = lgb.Dataset(X, y)
# Training configuration handed to lgb.train: gradient-boosted trees with a
# binary objective, evaluated with binary log-loss.
# NOTE(review): 'num_trees' (100) here disagrees with num_boost_round=50 in
# the lgb.train call below. LightGBM documents 'num_trees' as an alias of the
# iteration count, so the value in params presumably wins -- confirm against
# the installed LightGBM version and keep only one of the two.
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        #'metric': {'binary_logloss', 'auc'},
        'metric': {'binary_logloss'},
        'metric_freq': 1,
        'is_training_metric': 'true',
        'max_bin': 255,
        'num_trees': 100,
        'learning_rate': 0.1,
        'num_leaves': 20,
        'tree_learner': 'serial',
        'feature_fraction': 0.9,
        'bagging_freq': 5,
        'bagging_fraction': 0.8,
        #'min_data_in_leaf': 50,
        #'min_sum_hessian_in_leaf': 5.0,
        #'is_enable_sparse': 'true',
        #'use_two_round_loading': 'false',
        'verbose': 0
}
print('Start training...')
# Train on the whole dataset; the resulting booster is only used further down
# to read per-feature importance scores.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=50,
                #valid_sets=lgb_eval,
                #early_stopping_rounds=5
                )

#rfc_model = RFC(n_estimators=500)
#rfc_model.fit(X,y)
# Importance scores of the trained booster, fetched once and reused both for
# the ranking table below and for the raw score export later in the script.
f_imptc = gbm.feature_importance()

# Pair every feature column name of X with its score.
df_feature = pd.DataFrame({'strain': list(X.columns), 'weight': f_imptc})

# Order features from highest to lowest weight and renumber the rows from 0,
# so that a row's position in df_feature2 is its rank.
df_feature2 = (
    df_feature
    .sort_values(by=['weight'], ascending=False)
    .reset_index(drop=True)
)
## Incrementally select features according to their feature-importance (FI)
## score and evaluate the model performance with an increasing number of
## features.
feature_counts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55]
# The whole sweep over feature_counts is repeated this many times; every
# repetition is written to its own group of rows in the result sheets.
n_repeats = 5

# Using the workbook as a context manager guarantees it is closed (and thus
# written to disk) even if one of the evaluation runs raises, so results that
# were already collected are not lost.
with xlsxwriter.Workbook('PreTrain_Result_01.xlsx') as workbook:
    worksheet1 = workbook.add_worksheet('AUCAVE')
    worksheet2 = workbook.add_worksheet('APAVE')
    worksheet3 = workbook.add_worksheet('Feature_importance_score')

    for repeat in range(n_repeats):
        for position, n_features in enumerate(feature_counts):
            # Take the n_features highest-ranked strains and prepend the
            # target as a 'tag' column. The explicit copy makes sure the
            # insert works on an independent frame rather than on a slice
            # of X.
            top_strains = df_feature2.strain[0:n_features]
            subset = X[top_strains].copy()
            subset.insert(0, 'tag', list(y))

            # lightGBM is the project helper imported at the top of the file;
            # its first two results are reported here as the average AUC and
            # the average AP.
            auc_AP = lightGBM(np.array(subset))
            print('The average AUC of'+str(n_features)+':'+str(auc_AP[0]))
            print('The average AP of'+str(n_features)+':'+str(auc_AP[1]))

            # One row per (repetition, feature count) pair.
            row = repeat * len(feature_counts) + position
            worksheet1.write(row, 0, n_features)
            worksheet1.write(row, 1, auc_AP[0])
            worksheet2.write(row, 0, n_features)
            worksheet2.write(row, 1, auc_AP[1])

    # Raw importance scores in the original column order of X (not ranked).
    for row, (strain, score) in enumerate(zip(X.columns, f_imptc)):
        worksheet3.write_string(row, 0, strain)
        worksheet3.write(row, 1, score)

# Export the highest-ranked features, with the target prepended as a 'tag'
# column, for use in later steps.
# NOTE(review): the earlier remark on this step said the top 15 features gave
# the best model performance, but the code exports the top 30 -- confirm which
# number is intended.
n_selected = 30
X_sub = X[df_feature2.strain[0:n_selected]].copy()
X_sub.insert(0, 'tag', list(y))

# Raw string so the Windows backslashes stay literal; the resulting path is
# character-for-character the same as before.
output_path = (
    r'F:\Project\Project9-DiseasesDiscrimination\step3WGS-01\/'
    + str(n_selected) + '-Selected-features.csv'
)
X_sub.to_csv(output_path, index=False)