## Import data
# NOTE: the workspace is intentionally not wiped with rm(list = ls()) any more;
# every object used below is assigned before it is read, so run the script in a
# fresh R session instead of deleting whatever else lives in the caller's one.
Data <- read.csv(
  "F:\\Project\\Project9-DiseasesDiscrimination\\step1FeatureSelection\\Sample-strain-matrix.csv",
  header = TRUE, row.names = 1, check.names = FALSE
)
# Delete the last column (labelled "unknown" by the original author).
# drop = FALSE keeps a data frame even if only one column would remain.
Data <- Data[, -ncol(Data), drop = FALSE]

## Keep only the UD and CD samples (349 rows according to the original note)
DataPath <- "F:\\Project\\Project9-DiseasesDiscrimination\\数据\\"
# SampleList.healthy <- read.csv(paste0(DataPath, "SampleList_healthy.csv"), header = FALSE)
SampleList.UD <- read.csv(paste0(DataPath, "SampleList_UD.csv"), header = FALSE)
SampleList.CD <- read.csv(paste0(DataPath, "SampleList_CD.csv"), header = FALSE)
# SampleList.healthy <- as.character(SampleList.healthy$V1)
# The sample identifiers are taken from the first (unnamed -> V1) column.
SampleList.UD <- as.character(SampleList.UD$V1)
SampleList.CD <- as.character(SampleList.CD$V1)
# Row order matters downstream: all UD samples first, then all CD samples.
Data.subset <- Data[c(SampleList.UD, SampleList.CD), ]

## Set extremely small values to zero, then delete the all-zero columns
Data.matrix <- as.matrix(Data.subset)
# Data.matrix[Data.matrix < 1e-08] <- 0 # threshold for the FTU matrix
Data.matrix[Data.matrix < 1e-06] <- 0 # threshold for the strain matrix
Data <- as.data.frame(Data.matrix)
# Remove the all-zero columns, i.e. features that are zero in every sample.
# drop = FALSE keeps a data frame even when only a single feature survives;
# without it the result collapses to a bare vector and ncol() below is NULL.
Data.subset <- Data[, colSums(Data) > 0, drop = FALSE]

# Number of UD samples; they occupy rows 1..n, the CD samples the rows after.
n <- length(SampleList.UD)

## Wilcoxon rank-sum test (UD vs CD), one test per feature
feaNum <- ncol(Data.subset)

group1_rows <- seq_len(n)
# `:` binds tighter than `+`, so `n+1:nrow(x)` means n + (1:nrow(x)) and runs
# past the last row. Build the second group's row range explicitly instead.
group2_rows <- n + seq_len(nrow(Data.subset) - n)

p.value <- vapply(
  seq_len(feaNum),
  function(j) {
    group1_freq <- as.numeric(Data.subset[group1_rows, j])
    group2_freq <- as.numeric(Data.subset[group2_rows, j])
    # exact = FALSE: use the normal approximation
    wilcox.test(group1_freq, group2_freq, exact = FALSE)$p.value
  },
  numeric(1)
)

# Keep the features with p < 0.01. A test can return NaN (e.g. when every
# value is identical); such features are discarded instead of producing an
# NA column index, which would abort with "undefined columns selected".
is.significant <- !is.na(p.value) & p.value < 0.01
Data.subset <- Data.subset[, is.significant, drop = FALSE]
#write.csv(Data.subset, file = "F:\\Project\\DiseasesDiscrimination\\数据\\FeatureSelection\\Sample-FTU-matrix-sub.csv")

## Correlation network: pairwise Spearman correlation between all features

n <- ncol(Data.subset) # from here on n is the number of retained features
Data.subset.cor.r <- matrix(0, nrow = n, ncol = n) # correlation estimates
Data.subset.cor.p <- matrix(0, nrow = n, ncol = n) # matching p-values
for (i in seq_len(n)) {
  # cor.test() selects the coefficient through `method`; an argument called
  # `type` is silently swallowed by `...` and Pearson would be computed.
  # exact = FALSE requests the asymptotic p-value directly, which avoids one
  # "cannot compute exact p-value with ties" warning per feature pair.
  tmp <- lapply(Data.subset, function(x) {
    cor.test(Data.subset[[i]], x, method = "spearman", exact = FALSE)
  })
  Data.subset.cor.r[i, ] <- vapply(
    tmp, function(res) unname(res$estimate), numeric(1),
    USE.NAMES = FALSE
  )
  Data.subset.cor.p[i, ] <- vapply(
    tmp, function(res) res$p.value, numeric(1),
    USE.NAMES = FALSE
  )
}

# Remove non-significant edges; an undefined (NA) p-value counts as
# non-significant so that no NA leaks into the network.
Data.subset.cor.r[is.na(Data.subset.cor.p) | Data.subset.cor.p > 0.05] <- 0

# Iterative (greedy) selection of representative features from the network:
# repeatedly pick the node with the largest total absolute correlation, record
# it, then remove it together with every node correlated with it above 0.4.
# The correlation matrix is symmetric, so no transposition is needed here.
Graph <- abs(Data.subset.cor.r)
Feature.list <- colnames(Data.subset)
Feature.selected <- c()
MaxNode <- 500 # upper bound on the number of selected features

for (i in seq_len(MaxNode)) {
  if (nrow(Graph) == 0) break # nothing to select from
  ind <- which.max(rowSums(Graph)) # first row with the maximal row sum
  IND <- Graph[ind, ] > 0.4 # the closely connected nodes (includes `ind` itself)
  Feature.selected <- c(Feature.selected, Feature.list[ind])
  # drop = FALSE keeps Graph a matrix even when a single node is left;
  # otherwise it collapses to a scalar and the size check below fails.
  Graph <- Graph[!IND, !IND, drop = FALSE]
  Feature.list <- Feature.list[!IND]
  # Stop once fewer than three nodes remain; those leftovers are not selected.
  if (nrow(Graph) < 3) break
}

# Keep only the selected features. drop = FALSE keeps a data frame even when a
# single feature was selected (a bare vector cannot take the label column).
Data.subset1 <- Data.subset[, Feature.selected, drop = FALSE]

# Class labels: '0' for the UD samples, '1' for the CD samples. This relies on
# the rows being ordered as all UD samples followed by all CD samples.
Data.subset1[, 'lab'] <- c(
  rep('0', length(SampleList.UD)),
  rep('1', length(SampleList.CD))
)
Data.subset1$lab <- as.factor(Data.subset1$lab)

# Rebuild the table with the label as the last column. The call shape is kept
# on purpose: data.frame() names that column after the expression
# (Data.subset1.lab) and sanitises the feature names, and the written file
# keeps exactly those column names.
Data.subset2 <- data.frame(
  Data.subset1[, -ncol(Data.subset1), drop = FALSE],
  Data.subset1$lab
)

write.csv(Data.subset2, file = "F:\\Project\\Project9-DiseasesDiscrimination\\step4WGS-12\\Sample-Selectedstrain-matrix.csv")




