
# coding: utf-8

# In[5]:


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, roc_auc_score
import xgboost as xgb
from sklearn.cross_validation import KFold
import lightgbm as lgb


# In[10]:


# Load the first input table from CSV into a pandas DataFrame.
# (The file name suggests a sample x selected-strain matrix -- confirm.)
print('Load data...')
# Named input_path rather than `input` so the builtin input() is not shadowed.
input_path = '/Users/zhouman/Desktop/LAB/IBD/new_version/Sample-Selectedstrain-matrix_01.csv'
data = pd.read_csv(input_path)


# In[376]:


# Split the table into features and labels:
#   - the first column is skipped,
#   - the last column is taken as the label,
#   - every column in between is a feature.
x = data[data.columns[1:-1]]
y = data[data.columns[-1]]
# Convert to plain numpy arrays for positional row indexing below.
# Fix: this previously read np.array(X), but X does not exist yet at this
# point, so a fresh run raised NameError; the DataFrame to convert is `x`.
X = np.array(x)
Y = np.array(y)


# In[359]:


# Random 80/20 train/test split: shuffle the row indices, then slice them.
# No seed is set here, so the split differs from run to run.
index = np.arange(X.shape[0])
np.random.shuffle(index)


# In[360]:


# Compute the split point once instead of four times. int() keeps it a valid
# slice bound on interpreters where round() returns a float (Python 2).
n_train = int(round(0.8*X.shape[0]))
train_rows = index[:n_train]
test_rows = index[n_train:]
X_train = X[train_rows, :]
X_test = X[test_rows, :]
Y_train = Y[train_rows]
Y_test = Y[test_rows]
#Y_train = keras.utils.to_categorical(Y[index[0:round(0.8*Y.shape[0])]], 3)
#Y_test = keras.utils.to_categorical(Y[index[round(0.8*Y.shape[0]):]], 3)


# In[361]:


# Notebook-style inspection of the training split's shapes. As a plain script
# this tuple is built and discarded; nothing is printed.
X_train.shape, Y_train.shape


# In[362]:


# Baseline classifiers: each one is fitted on the training split and then
# scored on the held-out test split, in the order LR, RFC, GBC, SVC.
lr_model = LR(penalty='l2', C=1000000000)
rfc_model = RFC(n_estimators=500)
gbc_model = GBC(max_depth=64, n_estimators=500)
svc_model = SVC(gamma=100, C=100000)

for clf in (lr_model, rfc_model, gbc_model, svc_model):
    clf.fit(X_train, Y_train)
    # The score is computed but neither stored nor printed, so it is only
    # visible when this code runs as the last expression of a notebook cell.
    clf.score(X_test, Y_test)


# In[366]:


# Train a booster with xgboost's native API on the training split and
# predict on the test split.
dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test, label=Y_test)

param = {
    'max_depth': 5,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}
num_round = 20  # number of boosting rounds passed to xgb.train

bst = xgb.train(param, dtrain, num_boost_round=num_round)
# One prediction per row of dtest.
preds = bst.predict(dtest)


# In[367]:


# Accuracy of the xgboost predictions: the fraction of test samples whose
# rounded prediction equals the true label.
# The rounding is done once for the whole vector; the previous loop re-rounded
# the entire `preds` array on every iteration (quadratic work).
rounded_preds = np.round(preds)
count = float(np.sum(rounded_preds == Y_test))
print(count/Y_test.shape[0])


# In[6]:


def _positive_class_scores(model, X_val):
    """Return one continuous score per row of X_val for ranking metrics.

    Uses ``predict_proba`` (second column) when the fitted model exposes it,
    otherwise ``decision_function``, and only falls back to the hard labels
    from ``predict`` when neither is available.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X_val)[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(X_val)
    return model.predict(X_val)


def cross_validation(model, X, Y, n_fold):
    """Evaluate ``model`` with shuffled k-fold cross-validation.

    For every fold the model is refitted in place on the training rows and
    scored on the validation rows with average precision (auPRC) and ROC AUC
    (auROC). The per-fold values are averaged, printed and returned.

    Both metrics rank samples by score, so they are computed from continuous
    scores (probabilities or decision values) rather than from hard predicted
    labels. This matches how the LightGBM evaluation in this file scores its
    predicted probabilities.

    Args:
        model: estimator with ``fit`` and ``predict`` (and optionally
            ``predict_proba`` / ``decision_function``).
        X: 2-D array indexed as ``X[rows, :]``.
        Y: 1-D array of labels aligned with the rows of ``X``.
            Assumes a binary problem -- TODO confirm for each dataset.
        n_fold: number of folds.

    Returns:
        Tuple ``(mean auPRC, mean auROC)`` over the folds.
    """
    auPRC = np.zeros([n_fold])
    auROC = np.zeros([n_fold])
    # NOTE(review): this is the (n_samples, n_folds) call form of
    # sklearn.cross_validation.KFold, which is iterated directly. The KFold in
    # sklearn.model_selection has a different signature -- confirm before
    # changing the import.
    kf = KFold(X.shape[0], n_fold, shuffle=True)
    for itr, (train_index, val_index) in enumerate(kf):
        model.fit(X[train_index, :], Y[train_index])
        Y_val = Y[val_index]
        Y_score = _positive_class_scores(model, X[val_index, :])
        auPRC[itr] = average_precision_score(Y_val, Y_score)
        auROC[itr] = roc_auc_score(Y_val, Y_score)
    mean_auPRC = np.average(auPRC)
    mean_auROC = np.average(auROC)
    print('auPRC: '+str(mean_auPRC))
    print('auROC: '+str(mean_auROC))
    return mean_auPRC, mean_auROC


# In[378]:


# 5-fold cross-validation of the four classifiers on the currently loaded
# X / Y. Each model is created fresh and evaluated right away; the evaluation
# order is LR, RFC, GBC, SVC.
lr_model = LR(penalty='l2', C=1000000000)
rfc_model = RFC(n_estimators=500)
gbc_model = GBC(max_depth=64, n_estimators=100)
svc_model = SVC(gamma=100, C=100000)

lr_auPRC, lr_auROC = cross_validation(lr_model, X, Y, n_fold=5)
rfc_auPRC, rfc_auROC = cross_validation(rfc_model, X, Y, n_fold=5)
gbc_auPRC, gbc_auROC = cross_validation(gbc_model, X, Y, n_fold=5)
svc_auPRC, svc_auROC = cross_validation(svc_model, X, Y, n_fold=5)


# In[9]:


# Load the second input table and rebuild the feature / label arrays from it.
# Named input_path rather than `input` so the builtin input() is not shadowed.
input_path = '/Users/zhouman/Desktop/LAB/IBD/new_version/Sample-Selectedstrain-matrix_12.csv'
data = pd.read_csv(input_path)

# The first column is skipped, the last column is the label, and everything
# in between is a feature.
x = data[data.columns[1:-1]]
y = data[data.columns[-1]]

# Plain numpy arrays for the positional row indexing used by the
# cross-validation code.
X = np.array(x)
Y = np.array(y)

# Same four classifiers, re-created and cross-validated (5 folds) on the
# X / Y arrays that are current at this point in the script.
lr_model = LR(penalty='l2', C=1000000000)
rfc_model = RFC(n_estimators=500)
gbc_model = GBC(max_depth=64, n_estimators=100)
svc_model = SVC(gamma=100, C=100000)

lr_auPRC, lr_auROC = cross_validation(lr_model, X, Y, n_fold=5)
rfc_auPRC, rfc_auROC = cross_validation(rfc_model, X, Y, n_fold=5)
gbc_auPRC, gbc_auROC = cross_validation(gbc_model, X, Y, n_fold=5)
svc_auPRC, svc_auROC = cross_validation(svc_model, X, Y, n_fold=5)

# 5-fold cross-validation of a LightGBM binary classifier on X / Y.
# For each fold a booster is trained on the training rows, its predictions on
# the validation rows are scored with average precision and ROC AUC, and the
# fold averages are printed at the end.
n_fold = 5
auPRC = np.zeros([n_fold])
auROC = np.zeros([n_fold])
kf = KFold(X.shape[0], n_fold, shuffle=True)

# The training settings are the same for every fold, so build them once.
# NOTE(review): 'num_trees': 100 and num_boost_round=50 (passed to lgb.train
# below) disagree on the number of boosting rounds; which one takes effect
# depends on the LightGBM version -- confirm the intended value.
params = {'task': 'train',
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'binary_logloss',
          'metric_freq': 1,
          'is_training_metric': 'true',
          'max_bin': 255,
          'num_trees': 100,
          'learning_rate': 0.1,
          'num_leaves': 20,
          'tree_learner': 'serial',
          'feature_fraction': 0.9,
          'bagging_freq': 5,
          'bagging_fraction': 0.8,
          'verbose': 0}

for itr, (train_index, val_index) in enumerate(kf):
    X_train = X[train_index, :]
    X_val = X[val_index, :]
    Y_train = Y[train_index]
    Y_val = Y[val_index]
    # Only a training Dataset is needed: no validation set is passed to
    # lgb.train, so the previously created (and unused) eval Dataset is gone.
    lgb_train = lgb.Dataset(X_train, Y_train)
    # train
    # Hand over a copy so every fold starts from identical settings even if
    # lgb.train modifies the dict it receives.
    gbm = lgb.train(dict(params),
                    lgb_train,
                    num_boost_round=50)
    # predict
    p = gbm.predict(X_val, num_iteration=gbm.best_iteration)
    auPRC[itr] = average_precision_score(Y_val, p)
    auROC[itr] = roc_auc_score(Y_val, p)
print(np.average(auPRC), np.average(auROC))


# In[13]:


# Another run of 5-fold LightGBM cross-validation on the current X / Y.
# KFold is created with shuffle=True, so this run draws its own fold split.
# Per fold: train a booster on the training rows, predict the validation rows,
# record average precision and ROC AUC; finally print the fold averages.
n_fold = 5
auPRC = np.zeros([n_fold])
auROC = np.zeros([n_fold])
kf = KFold(X.shape[0], n_fold, shuffle=True)

# Fold-independent training settings, defined once outside the loop.
# NOTE(review): 'num_trees': 100 conflicts with num_boost_round=50 in the
# lgb.train call below; which value is used depends on the LightGBM version
# -- confirm the intended number of boosting rounds.
params = {'task': 'train',
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'binary_logloss',
          'metric_freq': 1,
          'is_training_metric': 'true',
          'max_bin': 255,
          'num_trees': 100,
          'learning_rate': 0.1,
          'num_leaves': 20,
          'tree_learner': 'serial',
          'feature_fraction': 0.9,
          'bagging_freq': 5,
          'bagging_fraction': 0.8,
          'verbose': 0}

for itr, (train_index, val_index) in enumerate(kf):
    X_train = X[train_index, :]
    X_val = X[val_index, :]
    Y_train = Y[train_index]
    Y_val = Y[val_index]
    # No validation set is given to lgb.train, so only the training Dataset
    # is built; the unused eval Dataset has been dropped.
    lgb_train = lgb.Dataset(X_train, Y_train)
    # train
    # A per-fold copy keeps the shared settings intact in case lgb.train
    # alters the dict it is given.
    gbm = lgb.train(dict(params),
                    lgb_train,
                    num_boost_round=50)
    # predict
    p = gbm.predict(X_val, num_iteration=gbm.best_iteration)
    auPRC[itr] = average_precision_score(Y_val, p)
    auROC[itr] = roc_auc_score(Y_val, p)
print(np.average(auPRC), np.average(auROC))

