/*
File-loading routines and other commonly used helper functions
Author: Longshu Yang
Latest update: 2013/11/20
Version: 1.0
*/

#include "common.h"
using namespace std;

//Loading Functions
//Load a .fna file and return its sequence as one contiguous string.
//  In: path of the file to read.
//The first line (presumably the FASTA header) is discarded; every following
//non-empty line is appended to the result in file order.
//If the file cannot be opened a message is printed and the program ends
//through exit(0).
//NOTE(review): exit(0) reports success to the calling shell even though
//loading failed - confirm whether a non-zero status is wanted.
Str read_in_fna(Str In){
	Str Result;
	Ifstream in(In.data());
	if(!in){
		cout<<"Can't open " + In + "!"<<endl;
		exit(0);
	}
	Str Line;
	//Skip the first line.
	getline(in,Line);
	//Loop on the read itself instead of on eof(): the loop then also stops
	//when the stream fails for any other reason instead of spinning forever.
	while(getline(in,Line)){
		if(!Line.empty())
			Result += Line;
	}
	in.close();
	cout<<In + " has been loaded!"<<endl;
	return Result;
}

//Load an annotation file (.rnt, .ptt or .ano) into a ptt structure.
//  In: path of the annotation file.
//Layout expected by this parser:
//  line 1    : contains "1..<N>"; the text after "1.." is converted into all_len
//  lines 2-3 : skipped
//  remainder : whitespace-separated records
//              Location Strand Length PID Gene Synonym Code COG Product
//              Location has the form "<begin>..<end>"; Product additionally
//              takes the rest of its line.
//Each record is stored in anno under its Synonym and numbered in file order
//(num, N2S, order). G2S maps gene names other than "-" to the Synonym and P2S
//maps products other than "hypothetical protein" to the Synonym.
//If the file cannot be opened a message is printed and the program ends
//through exit(0).
ptt read_in_ptt(Str In){
	ptt Result;
	Ifstream in(In.data());
	if(!in){
		cout<<"Can't open " + In + "!"<<endl;
		exit(0);
	}
	Str Temp;
	getline(in,Temp);
	//Keep the position as size_type so that a missing marker is detected
	//instead of being turned into the index -1.
	const Str::size_type loc = Temp.find("1..");
	if(loc == Str::npos){
		cout<<"Warning: no \"1..\" length marker in the first line of " + In + "!"<<endl;
		Result.all_len = 0;
	}
	else{
		Str All_len = Temp.substr(loc + 3);
		convertFromString(Result.all_len,All_len);
	}
	getline(in,Temp);
	getline(in,Temp);
	int i = 0;
	for(;;){
		int length = 0;
		Str Location,Strand,PID,Gene,Synonym,Code,COG,Product;
		//Stop as soon as a record cannot be read completely. Testing only
		//eof() would loop forever after a malformed numeric field, because
		//the failed stream never reaches the end of the file.
		if(!(in>>Location>>Strand>>length>>PID>>Gene>>Synonym>>Code>>COG>>Product))
			break;
		//Fresh string per record: when the last record has no trailing
		//newline getline() fails and leaves its argument untouched, so a
		//reused buffer would append the previous record's text.
		Str Rest;
		getline(in,Rest);
		Product += Rest;
		Ve_Str Parsed = string_parse(Location,"..");
		//A location without ".." yields a single piece; skip such a record
		//rather than reading Parsed[1] out of range.
		if(Parsed.size() < 2)
			continue;
		int begin = 0, end = 0;
		convertFromString(begin,Parsed[0]);
		convertFromString(end,Parsed[1]);
		Result.anno[Synonym].begin = begin;
		Result.anno[Synonym].end = end;
		Result.anno[Synonym].length = length;
		Result.anno[Synonym].COG = COG;
		Result.anno[Synonym].Gene = Gene;
		Result.anno[Synonym].Strand = Strand;
		Result.anno[Synonym].Product = Product;
		if(Gene != "-")
			Result.G2S[Gene] = Synonym;
		Result.anno[Synonym].num = i;
		if(Product != "hypothetical protein")
			Result.P2S[Product] = Synonym;
		Result.N2S[i] = Synonym;
		Result.order.push_back(Synonym);
		i ++;
	}
	//Leaving the loop without having reached the end of the file means a
	//record could not be parsed.
	if(!in.eof())
		cout<<"Warning: stopped reading " + In + " at a malformed record!"<<endl;
	in.close();
	cout<<In + " has been loaded!"<<endl;
	return Result;
}

//Load the training set sequences.
//  In: path of the training file.
//From each non-blank line only the first whitespace-delimited token is kept;
//whatever follows it on the same line is read and thrown away.
//If the file cannot be opened a message is printed and the program ends
//through exit(0).
Ve_Str read_in_training(Str In){
	Ve_Str result;
	Ifstream in(In.data());
	if(!in){
		cout<<"Can't open " + In + "!"<<endl;
		exit(0);
	}
	while(!in.eof()){
		Str Token, Rest;
		in>>Token;
		//Consume the remainder of the line so the next read starts on a new line.
		getline(in,Rest);
		if(Token.empty())
			continue;
		result.push_back(Token);
	}
	in.close();
	cout<<In + " has been loaded!"<<endl;
	return result;
}

//Load the parameters of a sigma factor and expand them into parameter sets.
//  In : path of the parameter file.
//  WHI: true  -> one set for every combination of variation (5..Variation),
//                WM10 width, WM35 width and minimum distance;
//       false -> one set per WM10 width only.
//The file is read as "<key> <integer>" pairs. Recognised keys:
//  Promoter_region:   stored in every generated set as reg
//  TIS_region:        stored as result.regi
//  Scan_region:       stored as result.regs
//  WM10: / WM35: / Minimum_distance:   lower bound, followed by an upper bound
//  Variation:         upper bound of the variation loop
//  any key containing "Sigma"          stored as result.Ty
//If the file cannot be opened a message is printed and the program ends
//through exit(0).
//NOTE(review): when WHI is false only wm1 and reg of each set are assigned
//here; the other members keep whatever the sig type gives them by default.
Sig read_in_sig(Str In, bool WHI){
	Sig result;
	int reg,wm1l,wm1u,wm2l,wm2u,minl,minu,dis;
	reg = wm1l = wm1u = wm2l = wm2u = minl = minu = dis = 0;
	Ifstream in(In.data());
	if(!in){
		cout<<"Can't open " + In + "!"<<endl;
		exit(0);
	}
	//Run while the stream is usable. Testing only eof() never terminates
	//once a non-numeric value puts the stream into the fail state.
	while(in){
		Str Temp;
		int temp = 0;
		in>>Temp>>temp;
		if(Temp.empty())
			continue;
		if(Temp == "Promoter_region:")
			reg = temp;
		else if(Temp == "TIS_region:")
			result.regi = temp;
		else if(Temp == "Scan_region:")
			result.regs = temp;
		else if(Temp == "WM10:"){
			wm1l = temp;
			in>>temp;
			wm1u = temp;
		}
		else if(Temp == "WM35:"){
			wm2l = temp;
			in>>temp;
			wm2u = temp;
		}
		else if(Temp == "Variation:")
			dis = temp;
		else if(Temp == "Minimum_distance:"){
			minl = temp;
			in>>temp;
			minu = temp;
		}
		else if(Temp.find("Sigma") != Str::npos)
			result.Ty = Temp;
	}
	//Leaving the loop without having reached the end of the file means a
	//value could not be parsed.
	if(!in.eof())
		cout<<"Warning: stopped reading " + In + " at a malformed entry!"<<endl;
	sig choice;
	if(WHI){
		for(int i = 5; i <= dis; i ++){
			for(int j = wm1l; j <= wm1u; j ++){
				for(int k = wm2l; k <= wm2u; k ++){
					for(int l = minl; l <= minu; l ++){
						choice.dis = i;
						choice.min = l;
						choice.wm1 = j;
						choice.wm2 = k;
						choice.reg = reg;
						result.para.push_back(choice);
					}
				}
			}
		}
	}
	else{
		for(int j = wm1l; j <= wm1u; j ++){
			choice.wm1 = j;
			choice.reg = reg;
			result.para.push_back(choice);
		}
	}
	return result;
}

//Load the scoring thresholds.
//  In: path of the threshold file.
//The file is scanned token by token. After the token "long_distance:" the
//rest of that line is split on tabs and every non-empty piece is converted to
//a double and appended to result.thl; "short_distance:" fills result.ths in
//the same way. All other tokens are ignored.
//If the file cannot be opened a message is printed and the program ends
//through exit(0).
threshold read_in_threshold(Str In){
	threshold result;
	Ifstream in(In.data());
	if(!in){
		cout<<"Can't open " + In + "!"<<endl;
		exit(0);
	}
	while(!in.eof()){
		Str Temp;
		in>>Temp;
		const bool isLong = (Temp == "long_distance:");
		const bool isShort = !isLong && (Temp == "short_distance:");
		if(!isLong && !isShort)
			continue;
		//The values share the line with their key.
		getline(in,Temp);
		Ve_Str fields = string_parse(Temp,"\t");
		const int count = (int) fields.size();
		for(int k = 0; k < count; k ++){
			if(fields[k].empty())
				continue;
			double value = 0;
			convertFromString(value,fields[k]);
			if(isLong)
				result.thl.push_back(value);
			else
				result.ths.push_back(value);
		}
	}
	cout<<In + " has been loaded!"<<endl;
	return result;
}

//Load position weight matrices together with spacer, start-site-distance and
//minimum values.
//  In: path of the parameter file.
//The file is processed line by line; empty lines are ignored.
//  - A line containing ">" closes the current section: all modes are switched
//    off and a pending weight matrix is appended to result.wm.
//  - A line containing "score" plus one of "weight matrix", "spacer" or
//    "start site", or a line containing "minimum", switches the matching
//    mode on and is not treated as data.
//  - Any other line is split on tabs and handed to every active mode:
//      weight matrix : column i becomes the value of Base[i] in one matrix row
//      spacer        : the values form one row of result.spacer
//      start site    : the values are appended to result.dis
//      minimum       : the values are appended to result.min as integers
//If the file cannot be opened a message is printed and the program ends
//through exit(0).
sc read_in_parameters(Str In){
	sc result;
	Ifstream in(In.data());
	if(!in){
		cout<<"Can't open " + In + "!"<<endl;
		exit(0);
	}
	bool WM = false;
	bool SP = false;
	bool SD = false;
	bool MI = false;
	Ve_Map_C_D temps;
	Str Line;
	//Loop on the read itself so that a failing stream ends the loop.
	while(getline(in,Line)){
		if(Line.empty())
			continue;
		const bool hasScore = (Line.find("score") != Str::npos);
		if(Line.find(">") != Str::npos){
			WM = SP = SD = MI = false;
			if(!temps.empty())
				result.wm.push_back(temps);
			temps.clear();
		}
		if(Line.find("weight matrix") != Str::npos && hasScore){
			WM = true;
			continue;
		}
		else if(Line.find("spacer") != Str::npos && hasScore){
			SP = true;
			continue;
		}
		else if(Line.find("start site") != Str::npos && hasScore){
			SD = true;
			continue;
		}
		else if(Line.find("minimum") != Str::npos){
			MI = true;
			continue;
		}
		//Data line: split once and share the pieces between the active modes.
		Ve_Str score = string_parse(Line,"\t");
		const int count = (int) score.size();
		if(WM){
			Map_C_D temp;
			//Columns beyond the alphabet have no base to belong to; ignoring
			//them avoids reading Base out of range.
			const int letters = (int) Base.size();
			for(int i = 0; i < count && i < letters; i ++){
				if(!score[i].empty()){
					double p = 0;
					convertFromString(p,score[i]);
					temp[Base[i]] = p;
				}
			}
			temps.push_back(temp);
		}
		if(SP){
			Ve_D temp;
			for(int i = 0; i < count; i ++){
				if(!score[i].empty()){
					double p = 0;
					convertFromString(p,score[i]);
					temp.push_back(p);
				}
			}
			//NOTE(review): prints one empty line per spacer row; this looks
			//like a leftover of debug output - confirm before removing.
			cout<<endl;
			result.spacer.push_back(temp);
		}
		if(SD){
			for(int i = 0; i < count; i ++){
				if(!score[i].empty()){
					double p = 0;
					convertFromString(p,score[i]);
					result.dis.push_back(p);
				}
			}
		}
		if(MI){
			for(int i = 0; i < count; i ++){
				if(!score[i].empty()){
					int a = 0;
					convertFromString(a,score[i]);
					result.min.push_back(a);
				}
			}
		}
	}
	//A matrix is otherwise stored only when the next ">" line is met, so a
	//matrix that ends the file has to be stored here or it would be lost.
	if(!temps.empty())
		result.wm.push_back(temps);
	cout<<In + " has been loaded!"<<endl;
	return result;
}

//Load a .rnt or .ptt file for integration.
//  In: path of the file.
//The first three lines are kept verbatim as Species, Num and Title. Every
//later line whose first tab-separated column has the form "<begin>..<end>"
//is stored unchanged in content[begin][end]; all other lines are skipped.
//If the file cannot be opened a message is printed and the program ends
//through exit(0).
all read_in_info(Str In){
	all result;
	Ifstream in(In.data());
	if(!in){
		cout<<"Can't open " + In + "!"<<endl;
		exit(0);
	}
	getline(in,result.Species);
	getline(in,result.Num);
	getline(in,result.Title);
	Str Line;
	//Loop on the read itself so that a failing stream ends the loop.
	while(getline(in,Line)){
		//string_parse always returns at least one piece, so [0] is safe.
		Ve_Str parsed = string_parse(Line,"\t");
		Ve_Str loci = string_parse(parsed[0],"..");
		//Without ".." there is no second piece; reading loci[1] would be
		//out of range, so such lines are skipped.
		if(loci.size() < 2 || loci[0].empty())
			continue;
		int begin = 0, end = 0;
		convertFromString(begin,loci[0]);
		convertFromString(end,loci[1]);
		result.content[begin][end] = Line;
	}
	cout<<In + " has been loaded!"<<endl;
	return result;
}


//Load a two-column list into a key -> value map.
//  In: path of the list file.
//The first line is skipped. The remainder is read as whitespace-separated
//token pairs; the first token of a pair is the key, the second its value.
//A later pair with the same key replaces the earlier value. A final key that
//has no value is stored with an empty value.
//If the file cannot be opened a message is printed and the program ends
//through exit(0).
Map_Str_Str read_in_list(Str In){
	Map_Str_Str result;
	Ifstream in(In.data());
	if(!in){
		cout<<"Can't open " + In + "!"<<endl;
		exit(0);
	}
	Str Header;
	getline(in,Header);
	//Run while the stream is usable so that a failing stream ends the loop.
	while(in){
		//Both strings are created per pair: a failed extraction leaves its
		//target untouched, so a shared buffer would hand the previous value
		//(or the header line) to a key that has no value of its own.
		Str Key, Value;
		in>>Key>>Value;
		if(!Key.empty())
			result[Key] = Value;
	}
	cout<<In + " has been loaded!"<<endl;
	return result;
}

//Commonly Used Functions

//Return the complementary base of S.
//'A' <-> 'T' and 'G' <-> 'C' are swapped; any other character (lower-case
//letters included) is returned unchanged.
char complement(char S){
	switch(S){
		case 'A': return 'T';
		case 'T': return 'A';
		case 'G': return 'C';
		case 'C': return 'G';
		default:  return S;
	}
}

//Return the reverse complement of Forward: the characters are taken from the
//last to the first and each one is passed through complement().
Str double_strand(Str Forward){
	Str Result(Forward);
	const int last = (int) Forward.size() - 1;
	for(int src = last, dst = 0; src >= 0; -- src, ++ dst)
		Result[dst] = complement(Forward[src]);
	return Result;
}

//Return the fraction of G and C among the A, C, G and T characters of Seq.
//Only upper-case letters are counted; every other character is ignored.
//Returns 0 when Seq contains none of the four letters (the plain division
//would otherwise be 0/0 and yield NaN).
double gc_content(Str Seq){
	int gc = 0;
	int counted = 0;
	const int length = (int) Seq.size();
	for(int i = 0; i < length; i ++){
		const char base = Seq[i];
		if(base == 'C' || base == 'G'){
			gc ++;
			counted ++;
		}
		else if(base == 'A' || base == 'T')
			counted ++;
	}
	if(!counted)
		return 0;
	return (double) gc / counted;
}


//Split Line at every occurrence of Key.
//  Line: text to split.
//  Key : separator; it may be longer than one character.
//Returns the pieces in order. The result always holds at least one element:
//the whole line when Key does not occur (or is empty). Adjacent separators
//and a separator at either end produce empty pieces.
Ve_Str string_parse(con_Str Line,con_Str Key){
	Ve_Str result;
	//An empty key matches at every position without ever advancing, which
	//would loop forever; hand the line back unsplit instead.
	if(Key.empty()){
		result.push_back(Line);
		return result;
	}
	//Positions stay in size_type: narrowing find() results to int breaks on
	//long inputs and relies on a negative count being clamped by substr().
	Str::size_type posB = 0;
	Str::size_type posE = Line.find(Key);
	while(posE != Str::npos){
		result.push_back(Line.substr(posB,posE - posB));
		posB = posE + Key.size();
		posE = Line.find(Key,posB);
	}
	//Remainder after the last separator (possibly empty).
	result.push_back(Line.substr(posB));
	return result;
}

//K-mer extraction.
//  w   : length of the k-mers.
//  seqs: sequences to scan.
//Counts the substrings of length w over all sequences and returns up to 20 of
//the most frequent ones. The result is not sorted by count: the first 20
//distinct k-mers (in the iteration order of the count map) fill the slots and
//a later k-mer replaces the currently rarest slot only when it is strictly
//more frequent.
//NOTE(review): the window loop runs while j < length - w, so the k-mer that
//ends on the last character of a sequence is never counted - confirm whether
//that is intended before changing the bound.
Ve_Str kmer(int w,Ve_Str seqs){
	const int keep = 20;
	Ve_Str result;
	Map_Str_I found;
	for(int i = 0; i < (int) seqs.size(); i ++){
		//Signed length: with the unsigned length() a sequence shorter than w
		//makes "length() - w" wrap around and substr() run past the end.
		const int len = (int) seqs[i].length();
		for(int j = 0; j < len - w; j ++)
			++ found[seqs[i].substr(j,w)];
	}
	Ve_I com;
	//first = slot of the rarest kept k-mer, second = its count.
	Pa_I_I lowest = make_pair(0,0);
	for(Map_Str_I::const_iterator iter = found.begin(); iter != found.end(); ++ iter){
		if((int) result.size() < keep){
			result.push_back(iter->first);
			com.push_back(iter->second);
			//Seed the minimum with the first entry rather than a fixed
			//number, so counts of any size are tracked correctly.
			if(com.size() == 1 || iter->second < lowest.second){
				lowest.first = (int) result.size() - 1;
				lowest.second = iter->second;
			}
		}
		else if(iter->second > lowest.second){
			result[lowest.first] = iter->first;
			com[lowest.first] = iter->second;
			//Find the rarest slot again after the replacement.
			lowest.first = 0;
			lowest.second = com[0];
			for(int k = 1; k < (int) com.size(); k ++){
				if(com[k] < lowest.second){
					lowest.second = com[k];
					lowest.first = k;
				}
			}
		}
	}
	cout<<"K-mer extraction has been completed!"<<endl;
	return result;
}


//Build an initial matrix from a sequence: one column per character of Seq.
//In each column the character found in the sequence gets 0.52 and every
//entry of Base that differs from it gets 0.16.
Ve_Map_C_D assign(Str Seq){
	Ve_Map_C_D result;
	const int length = (int) Seq.size();
	const int letters = (int) Base.size();
	for(int pos = 0; pos < length; ++ pos){
		const char seen = Seq[pos];
		Map_C_D column;
		column[seen] = 0.52;
		for(int b = 0; b < letters; ++ b){
			if(Base[b] == seen)
				continue;
			column[Base[b]] = 0.16;
		}
		result.push_back(column);
	}
	return result;
}

//Build a single column in which every entry of Base carries the value a.
Map_C_D assign(int a){
	Map_C_D result;
	const int letters = (int) Base.size();
	int b = 0;
	while(b < letters){
		result[Base[b]] = a;
		++ b;
	}
	return result;
}

//Divide every entry of the matrix w by the value that bac holds for the same
//base and return the resulting matrix.
//  w  : matrix, one column per position.
//  bac: per-base divisor (presumably the background frequencies - confirm).
Ve_Map_C_D rate(Ve_Map_C_D w, Map_C_D bac){
	Ve_Map_C_D result(w);
	const int columns = (int) w.size();
	const int letters = (int) Base.size();
	for(int col = 0; col < columns; ++ col){
		Map_C_D &column = result[col];
		for(int b = 0; b < letters; ++ b){
			const char base = Base[b];
			column[base] = column[base] / bac[base];
		}
	}
	return result;
}


//Turn a matrix into a score matrix by taking the natural logarithm of the
//entry of every base in Base, column by column.
Ve_Map_C_D score_matrix(Ve_Map_C_D wm){
	Ve_Map_C_D result;
	const int columns = (int) wm.size();
	const int letters = (int) Base.size();
	for(int col = 0; col < columns; ++ col){
		//Start from a column holding 0 for every base, then fill in the logs.
		Map_C_D logs = assign(0);
		for(int b = 0; b < letters; ++ b){
			const char base = Base[b];
			logs[base] = log(wm[col][base]);
		}
		result.push_back(logs);
	}
	return result;
}

//Integration for .ano file.
//  Ptt: path of the .ptt file.
//  Rnt: path of the .rnt file.
//  Out: path of the file to write.
//Both inputs are loaded with read_in_info() and their records are merged by
//(begin, end); where both files hold the same pair the .rnt record wins.
//The output starts with the species line of the .ptt file, the two Num lines
//joined by a tab and the title line of the .ptt file, followed by all records
//in ascending (begin, end) order.
//Returns a fixed completion message.
Str integrate(Str Ptt, Str Rnt, Str Out){
	all pt = read_in_info(Ptt);
	all rt = read_in_info(Rnt);
	Map_I_Map_I_Str one;
	//.ptt first, .rnt second, so that .rnt overwrites on equal coordinates.
	const all *sources[2] = {&pt, &rt};
	for(int s = 0; s < 2; ++ s){
		const Map_I_Map_I_Str &content = sources[s]->content;
		for(Map_I_Map_I_Str::const_iterator outer = content.begin(); outer != content.end(); ++ outer){
			for(Map_I_Str::const_iterator inner = outer->second.begin(); inner != outer->second.end(); ++ inner)
				one[outer->first][inner->first] = inner->second;
		}
	}
	Ofstream out(Out.data());
	out<<pt.Species<<endl;
	out<<pt.Num<<"\t"<<rt.Num<<endl;
	out<<pt.Title<<endl;
	Map_I_Map_I_Str::const_iterator row = one.begin();
	while(row != one.end()){
		Map_I_Str::const_iterator cell = row->second.begin();
		while(cell != row->second.end()){
			out<<cell->second<<endl;
			++ cell;
		}
		++ row;
	}
	out.close();
	return "Integration of .ptt and .rnt has been done!";
}

//Extract a window of reg characters next to every annotated gene.
//  reg: window length.
//  DNA: sequence the windows are cut from.
//  Ptt: annotation; genes are visited through N2S[0] .. N2S[size-1].
//Returns one sequence per gene, in that order. As a side effect the file
//"all 5'UTR.dat" is (re)written with one line per gene:
//  Synonym <tab> begin (strand "+") or end (otherwise) <tab> sequence
//NOTE(review): the index arithmetic treats begin/end as 1-based positions on
//DNA - confirm against the annotation format.
Ve_Str region(int reg, Str DNA, ptt Ptt){
	Ve_Str result;
	Ofstream out("all 5'UTR.dat");
	for(int i = 0; i < Ptt.N2S.size(); i ++){
		Str Synonym = Ptt.N2S[i];
		int pos = 0;
		int re = 0;
		Str Seq;
		if(Ptt.anno[Synonym].Strand == "+"){
			//Window of reg characters ending directly before the gene's begin.
			pos = Ptt.anno[Synonym].begin - 1 - reg;
			re = Ptt.anno[Synonym].begin;
			if(pos < 0){
				//The window starts before index 0: take the missing part
				//from the tail (all_len is used as the sequence length),
				//then the characters in front of the gene.
				Seq = DNA.substr(Ptt.all_len + pos,abs(pos));
				Seq += DNA.substr(0,Ptt.anno[Synonym].begin - 1);
			}
			else
				Seq = DNA.substr(pos,reg);
		}
		else{
			//Any strand other than "+": the window starts at the gene's end
			//position and is passed through double_strand() afterwards.
			pos = Ptt.anno[Synonym].end - 1;
			re = Ptt.anno[Synonym].end;
			if(pos + reg >= Ptt.all_len){
				//The window reaches the end of the sequence: tail first,
				//then the remainder from the start of DNA.
				//NOTE(review): the tail piece has DNA.length() - pos - 1
				//characters, i.e. it leaves out the last character of DNA,
				//and the test uses all_len while the lengths use
				//DNA.length() - confirm that both are intended.
				Seq = DNA.substr(pos,DNA.length() - pos - 1);
				Seq += DNA.substr(0,pos + reg + 1 - DNA.length());
				Seq = double_strand(Seq);
			}
			else{
				Seq = DNA.substr(pos,reg);
				Seq = double_strand(Seq);
			}
		}

		out<<Synonym<<"\t"<<re<<"\t"<<Seq<<endl;
		result.push_back(Seq);
	}
	out.close();
	return result;
}

//Decompose the number i into one digit per entry of limit, using limit[j] as
//the base of position j (the last position varies fastest).
//Every position except the first is reduced modulo its limit; the first
//position receives whatever quotient is left and is therefore not bounded by
//limit[0]. An empty limit yields an empty result.
De_I traversal(int i, Ve_I limit){
	De_I digits;
	int weight = 1;
	for(int pos = (int) limit.size() - 1; pos > 0; -- pos){
		digits.push_front((i / weight) % limit[pos]);
		weight *= limit[pos];
	}
	if(!limit.empty())
		digits.push_front(i / weight);
	return digits;
}

//Distance between the gene Synonym and its neighbour in annotation order.
//  strand "+" : begin of the gene minus end of the gene numbered one lower;
//               for gene number 0 the neighbour is the last gene and the
//               distance is measured across the end of the sequence
//               (all_len - end of last gene + begin).
//  otherwise  : begin of the gene numbered one higher minus end of the gene;
//               for the last gene the neighbour is gene number 0, again
//               measured across the end of the sequence.
int dist(Str Synonym, ptt ano){
	if(ano.anno[Synonym].Strand == "+"){
		if(ano.anno[Synonym].num){
			Str Before = ano.N2S[ano.anno[Synonym].num - 1];
			return ano.anno[Synonym].begin - ano.anno[Before].end;
		}
		const int last = ano.N2S.size() - 1;
		return ano.all_len - ano.anno[ano.N2S[last]].end + ano.anno[Synonym].begin;
	}
	const int last = ano.N2S.size() - 1;
	if(ano.anno[Synonym].num != last){
		Str After = ano.N2S[ano.anno[Synonym].num + 1];
		return ano.anno[After].begin - ano.anno[Synonym].end;
	}
	return ano.all_len - ano.anno[Synonym].end + ano.anno[ano.N2S[0]].begin;
}

//Distance from the gene Synonym to the nearest gene on the same strand.
//  strand "+" : walk towards lower gene numbers (wrapping from 0 to the last
//               gene) until a "+" gene is found; measured from that gene's
//               begin to this gene's begin.
//  otherwise  : walk towards higher gene numbers (wrapping from the last gene
//               to 0) until a "-" gene is found; measured from this gene's
//               end to that gene's end.
//When the start coordinate is larger than the end coordinate the span is
//taken across the end of the sequence (all_len - start + end).
int out_range(Str Synonym, ptt ano){
	int from = 0, to = 0;
	if(ano.anno[Synonym].Strand == "+"){
		int idx = ano.anno[Synonym].num;
		Str Other;
		do{
			idx --;
			if(idx < 0)
				idx += ano.N2S.size();
			Other = ano.N2S[idx];
		}while(ano.anno[Other].Strand != "+");
		from = ano.anno[Other].begin;
		to = ano.anno[Synonym].begin;
	}
	else{
		int idx = ano.anno[Synonym].num;
		Str Other;
		do{
			idx ++;
			if(idx >= ano.N2S.size())
				idx -= ano.N2S.size();
			Other = ano.N2S[idx];
		}while(ano.anno[Other].Strand != "-");
		to = ano.anno[Other].end;
		from = ano.anno[Synonym].end;
	}
	int span = 0;
	if(from > to)
		span = ano.all_len - from + to;
	else
		span = to - from;
	return span;
}

//Tell whether the gene Synonym and its neighbour lie on the same strand.
//The neighbour is the gene numbered one lower for strand "+" and one higher
//otherwise, wrapping around at both ends of the annotation.
//Returns "Par" when both strands are equal and "Div" when they differ.
Str direction(Str Synonym, ptt ano){
	const int curr = ano.anno[Synonym].num;
	Str Neighbour;
	if(ano.anno[Synonym].Strand == "+")
		Neighbour = curr ? ano.N2S[curr - 1] : ano.N2S[ano.N2S.size() - 1];
	else
		Neighbour = (curr == ano.N2S.size() - 1) ? ano.N2S[0] : ano.N2S[curr + 1];
	if(ano.anno[Synonym].Strand == ano.anno[Neighbour].Strand)
		return "Par";
	return "Div";
}


