/*
Promoter scanning module 
Author: Longshu Yang
Latest update: 2013/11/20
Version: 1.0
*/

#include "two.h"
using namespace std;

// Count how often each character occurs across all sequences and derive the
// background distribution from those counts.
//   seqs - sequences whose characters are tallied.
// On return B holds the raw per-character counts, O holds a copy of those
// counts, and pb holds, for every symbol in Base, its count divided by the
// total number of characters seen.
// There is no guard for empty input: with no characters the total stays 0 and
// the division below is by zero.
void TWOEM::back_ground(Ve_Str seqs){
	B = pb = assign(0);
	double sum = 0;
	for(size_t i = 0; i < seqs.size(); i ++){
		for(size_t j = 0; j < seqs[i].size(); j ++){
			B[seqs[i][j]] ++;
			sum ++;
		}
	}
	for(size_t v = 0; v < Base.size(); v ++)
		pb[Base[v]] = B[Base[v]] / sum;
	// Copy the counts once, after the loop, instead of re-copying the whole
	// map for every base.
	O = B;
}


// Build a two-box model from the search settings and a starting k-mer pair.
//   signal - supplies reg, wm1, wm2, dis and min, copied into reg, w1, w2,
//            dist and lower respectively.
//   Kmer   - Kmer[0] seeds rwm1 and Kmer[1] seeds rwm2; it must hold at least
//            two entries (not checked here).
//   data   - sequences passed to back_ground() to estimate pb.
// start is reg - w1 - w2 - lower. ps begins uniform over start + 1 slots and
// pd uniform over dist slots; the accumulators wm1, wm2 and C begin at zero.
// NOTE(review): nothing here rejects settings that make start negative or
// dist zero - confirm callers validate them.
TWOEM::TWOEM(sig signal, Ve_Str Kmer, Ve_Str data){
	reg = signal.reg;
	w1 = signal.wm1;
	w2 = signal.wm2;
	dist = signal.dis;
	lower = signal.min;
	start = reg - w1 - w2 - lower;
	max = 0;
	// EM() adds to p0 before calcu() ever resets it, so it has to start at zero.
	p0 = 0;
	ps.resize(start + 1,1.0 / (start + 1));
	pd.resize(dist,1.0 / dist);
	back_ground(data);
	for(int i = 0; i < w1; i ++)
		wm1.push_back(assign(0));
	for(int i = 0; i < w2; i ++)
		wm2.push_back(assign(0));
	// Seed both matrices from the k-mers, then pass them through rate() with pb.
	rwm1 = rate(assign(Kmer[0]),pb);
	rwm2 = rate(assign(Kmer[1]),pb);
	C.resize(dist,Ve_D(start,0));
}


//Expectation pass and likelihood score
// Runs up to t iterations over data. In each iteration, for every sequence,
// every placement is scored: the w2-wide matrix rwm2 at offset j, then the
// w1-wide matrix rwm1 starting at j + w2 + n + lower, for each spacer index n
// and each offset j < start - n. ps[start] is added as the weight of the
// "no placement" alternative. The normalised placement weights are added to
// C, wm1, wm2 and p0, and log of the per-sequence total is added to loglike.
// After all sequences the loop stops if loglike moved by less than 1e-6 of its
// own magnitude; otherwise last takes the value of loglike, loglike is reset
// and, unless t == 1, calcu() recomputes the parameters.
//   t    - maximum number of iterations; with t == 1 calcu() is never called.
//   data - sequences to scan. NOTE(review): each one is presumably at least
//          reg - 1 characters long, since indices up to reg - 2 are read - confirm.
// Returns a status message that embeds the loop counter at exit.
Str TWOEM::EM(int t, Ve_Str data){
	loglike = last = 0;
	int loop;
	for(loop = 0; loop < t; loop ++){
		Ve_Ve_D lam(dist,Ve_D(start,0));
		for(size_t k = 0; k < data.size(); k ++){
			const Str& seq = data[k];
			// Begin with the weight of the "no placement" alternative.
			double sumk = 0;
			sumk += ps[start];
			for(int n = 0; n < dist; n ++){
				for(int j = 0; j < start - n; j ++){
					// First position of the w1-wide box for this offset and spacer.
					const int b = j + w2 + n + lower;
					double v = 1;
					for(int i = 0; i < w2; i ++)
						v *= rwm2[i][seq[i + j]];
					for(int i = 0; i < w1; i ++)
						v *= rwm1[i][seq[i + b]];
					lam[n][j] = v * ps[j] * pd[n];
					sumk += lam[n][j];
				}
			}
			p0 += ps[start] / sumk;
			loglike += log(sumk);
			// Normalise each placement weight and spread it over the accumulators.
			for(int n = 0; n < dist; n ++){
				for(int j = 0; j < start - n; j ++){
					lam[n][j] /= sumk;
					C[n][j] += lam[n][j];
					for(int i = 0; i < w2; i ++)
						wm2[i][seq[i + j]] += lam[n][j];
					const int b = j + w2 + n + lower;
					for(int i = 0; i < w1; i ++)
						wm1[i][seq[i + b]] += lam[n][j];
				}
			}
		}
		// fabs rather than unqualified abs: depending on the headers in scope,
		// abs can resolve to the integer overload and truncate both sides.
		if(fabs(loglike - last) < fabs(1.0e-6 * loglike))
			break;
		last = loglike;
		loglike = 0;
		if(t != 1)
			calcu();
	}
	Str Temp;
	convertFromNumber(Temp,loop);
	return "EM iteration with " + Temp + " times has been completed!";
}


//Parameter re-estimation
// Converts the weights accumulated in wm1, wm2, C and p0 into new values for
// pb, rwm1, rwm2, ps and pd, and zeroes those accumulators afterwards.
//   pb         - B minus the weight absorbed by both matrices, normalised over Base;
//                B is then restored from O.
//   rwm1, rwm2 - each accumulated column normalised to sum to 1, then divided by pb.
//   ps[j]      - share of C at offset j out of (total of C + p0); ps[start] gets p0's share.
//   pd[n]      - share of C at spacer index n out of the total of C.
// No guard is applied when a column total, the background total or the total
// of C is zero.
// Returns a fixed status message.
Str TWOEM::calcu(){
	// Total weight over every (spacer, offset) cell.
	double placed = 0;
	for(int gap = 0; gap < dist; gap ++){
		for(int pos = 0; pos < start; pos ++)
			placed += C[gap][pos];
	}

	// Column totals per matrix position, and background counts with the
	// weight claimed by the two matrices taken out.
	Ve_D col1(w1,0);
	Ve_D col2(w2,0);
	double bg_total = 0;
	for(int s = 0; s < Base.size(); s ++){
		const char sym = Base[s];
		double in_boxes = 0;
		for(int p = 0; p < w1; p ++){
			col1[p] += wm1[p][sym];
			in_boxes += wm1[p][sym];
		}
		for(int p = 0; p < w2; p ++){
			col2[p] += wm2[p][sym];
			in_boxes += wm2[p][sym];
		}
		B[sym] -= in_boxes;
		bg_total += B[sym];
	}

	// New background and ratio matrices; accumulators are cleared as they are consumed.
	for(int s = 0; s < Base.size(); s ++){
		const char sym = Base[s];
		pb[sym] = B[sym] / bg_total;
		B[sym] = O[sym];
		for(int p = 0; p < w1; p ++){
			double& cell = wm1[p][sym];
			const double freq = cell / col1[p];
			rwm1[p][sym] = freq / pb[sym];
			cell = 0;
		}
		for(int p = 0; p < w2; p ++){
			double& cell = wm2[p][sym];
			const double freq = cell / col2[p];
			rwm2[p][sym] = freq / pb[sym];
			cell = 0;
		}
	}

	// Rebuild the offset and spacer distributions from C.
	ps.clear();
	ps.resize(start + 1,0);
	pd.clear();
	pd.resize(dist,0);
	const double everything = placed + p0;
	for(int gap = 0; gap < dist; gap ++){
		for(int pos = 0; pos < start; pos ++){
			const double w = C[gap][pos];
			ps[pos] += w / everything;
			pd[gap] += w / placed;
			C[gap][pos] = 0;
		}
	}
	ps[start] = p0 / everything;
	p0 = 0;
	return "Parameter calculation has been completed!";
}


// Choose a starting k-mer pair, refine the model with EM and write it to Out.
// Every combination of a k-mer from kmer(signal.wm1, data) with one from
// kmer(signal.wm2, data) is scored by a single EM pass; the first pair, and
// any later pair whose `last` value is strictly higher, is printed to cout and
// kept. The kept pair seeds the final model, which runs EM for up to maxloop
// iterations before its parameters are written out.
//   signal - search settings forwarded to every TWOEM that is built.
//   data   - sequences used for k-mer extraction and for EM.
//   NC     - label used only in the completion message on cout.
//   Out    - path of the report file.
// Returns the refined model, with its max field set from max_score().
// Nothing here checks that the file opened or that either k-mer list is non-empty.
TWOEM TSS2_pre(sig signal,Ve_Str data, Str NC, Str Out){
	Ofstream out(Out.data());
	Ve_Str Kmer1 = kmer(signal.wm1,data);
	Ve_Str Kmer2 = kmer(signal.wm2,data);

	// Screen every k-mer pair with one EM pass and remember the best-scoring one.
	Ve_Str kbest;
	double last = 0;
	for(int a = 0; a < Kmer1.size(); a ++){
		for(int b = 0; b < Kmer2.size(); b ++){
			Ve_Str seed;
			seed.push_back(Kmer1[a]);
			seed.push_back(Kmer2[b]);
			TWOEM trial(signal,seed,data);
			trial.EM(1,data);
			const bool first = (a == 0 && b == 0);
			if(first || trial.last > last){
				cout<<Kmer1[a]<<"\t"<<Kmer2[b]<<"\t"<<trial.last<<endl;
				last = trial.last;
				kbest = seed;
			}
		}
	}

	TWOEM best(signal,kbest,data);
	cout<<best.EM(maxloop,data)<<endl;
	best.max = max_score(best.rwm1, best.rwm2, best.pb, best.pd);

	const double ln2 = log(2.0);

	// Header, background and offset distribution (each value followed by a tab).
	out<<"Promoter features:"<<endl;
	out<<"max score: "<<best.max<<endl;
	out<<"TSS background probability:"<<endl;
	for(int col = 0; col < Base.size(); col ++)
		out<<best.pb[Base[col]]<<"\t";
	out<<endl;
	out<<"TSS start site probability:"<<endl;
	for(int pos = 0; pos < best.ps.size() - 1; pos ++)
		out<<best.ps[pos]<<"\t";
	out<<endl;
	out<<"no signal probability:"<<endl;
	out<<best.ps[best.ps.size() - 1]<<endl;
	out<<">minimum spacer:\n"<<signal.min<<endl;

	// Spacer distribution, then its log2 score relative to a uniform spacer.
	out<<">TSS spacer probability:"<<endl;
	for(int gap = 0; gap < best.pd.size(); gap ++)
		out<<best.pd[gap]<<"\t";
	out<<endl;
	out<<">TSS spacer score:"<<endl;
	for(int gap = 0; gap < best.pd.size(); gap ++){
		if(gap)
			out<<"\t";
		out<<log(best.pd[gap] * best.pd.size()) / ln2;
	}
	out<<endl;

	// The w2-wide matrix: log2 ratios, then ratio * background.
	out<<">weight matrix -35 score:"<<endl;
	for(int row = 0; row < best.wm2.size(); row ++){
		for(int col = 0; col < Base.size(); col ++){
			if(col)
				out<<"\t";
			out<<log(best.rwm2[row][Base[col]]) / ln2;
		}
		out<<endl;
	}
	out<<">weight matrix -35 probability:"<<endl;
	for(int row = 0; row < best.wm2.size(); row ++){
		for(int col = 0; col < Base.size(); col ++){
			if(col)
				out<<"\t";
			out<<best.rwm2[row][Base[col]] * best.pb[Base[col]];
		}
		out<<endl;
	}

	// The w1-wide matrix in the same two forms.
	out<<">weight matrix -10 score:"<<endl;
	for(int row = 0; row < best.wm1.size(); row ++){
		for(int col = 0; col < Base.size(); col ++){
			if(col)
				out<<"\t";
			out<<log(best.rwm1[row][Base[col]]) / ln2;
		}
		out<<endl;
	}
	out<<">weight matrix -10 probability:"<<endl;
	for(int row = 0; row < best.wm1.size(); row ++){
		for(int col = 0; col < Base.size(); col ++){
			if(col)
				out<<"\t";
			out<<best.rwm1[row][Base[col]] * best.pb[Base[col]];
		}
		out<<endl;
	}
	out.close();
	cout<<"TSSs of " + NC + " have been predicted!"<<endl;
	return best;
}

// Score summary for a pair of ratio matrices and a spacer distribution.
// For every matrix position and every symbol in Base it adds
//   log2(ratio) * ratio * pb[symbol]
// and for every spacer index i it adds
//   pd[i] * log2(pd[i] * pd.size()).
//   sm1, sm2 - ratio matrices, indexed [position][symbol].
//   pb       - background value per symbol.
//   pd       - spacer distribution.
// A term whose ratio (or pd[i]) is exactly zero contributes nothing: the
// expression is 0 * log(0), which evaluates to NaN and would otherwise turn
// the whole result into NaN. NaN or negative inputs are not filtered and still
// propagate.
double max_score(Ve_Map_C_D sm1, Ve_Map_C_D sm2, Map_C_D pb, Ve_D pd){
	const double ln2 = log(2.0);
	double result = 0;
	for(size_t i = 0; i < sm1.size(); i ++){
		for(size_t j = 0; j < Base.size(); j ++){
			const double ratio = sm1[i][Base[j]];
			// Exact comparison is intended: only a true zero hits log(0).
			if(ratio == 0)
				continue;
			result += log(ratio) / ln2 * ratio * pb[Base[j]];
		}
	}
	for(size_t i = 0; i < sm2.size(); i ++){
		for(size_t j = 0; j < Base.size(); j ++){
			const double ratio = sm2[i][Base[j]];
			if(ratio == 0)
				continue;
			result += log(ratio) / ln2 * ratio * pb[Base[j]];
		}
	}
	for(size_t i = 0; i < pd.size(); i ++){
		// A spacer index with zero probability adds nothing (same 0 * log(0) case).
		if(pd[i] == 0)
			continue;
		result += pd[i] * log(pd[i] * pd.size()) / ln2;
	}
	return result;
}
