library(mnormt)
library(MCMCpack)
library(MASS)

# Function VarSel() is used to select important variables in a 
# reproducing kernel. All conditional posterior distributions are # standard.
# Input:
# 	Y: response variable (nx1)
# 	X: matrix of covariates (nxnumCov)
#	Candid: matrix of candidate variables (nxp)
# 	numCov: number of covariates
#	rho: the scale parameter in a Gaussian kernel. The rho can be estimated using the 
#		function EstRho(), but our simulation results indicate that taking rho=1
#		works fine in general. 
# 	numItr: number of iterations per chain
# 	numChain: number of MCMC chains
# 	a0 and b0 are for the prior of sigma^2, sig2. We use the same for tau
# 	sigB2 is the variance in the prior of beta

# Output: this includes three pieces
# 	a)  quantiles obtained by treating (n - n1) x numChain matrix as samples  
#         from posterior of the first beta and tau, where n1 is the number of iterations after burn-in.
#     b) Gelman and Rubin's convergence measure R and an upper confidence
#         limit for R (values near 1 are desirable) for each of beta and tau
#	c) model size
#	d) indices of selected variables
# 	e) posterior probability of selecting each variable
# 	f) ordered posterior probabilities of each variable

VarSel<-function(Y, X, Candid, numCov=1, rho=1, numItr=50000, numChain=2,a0=0.001,b0=0.001,sigB2=10)
{
	sampsize<-length(Y)
	numCandid<-ncol(Candid) 
	# Gaussian kernel is used
	distance<-array(0,dim=c(numCandid,sampsize,sampsize))

	for (i in 1:(sampsize))
	{
		distance[,,i]<-t((Candid - matrix(1, ncol=1, nrow=sampsize)%*%(as.matrix(Candid[i,])))^2)
	}
	paramArray<-array(0,dim=c(numItr,numChain,numCov+1))

	delta<-rbinom(numCandid,1,0.5)

	K<-updateK(delta,distance,sampsize,rho)
	for (i in 1:numChain)
	{
		set.seed(seed=1000*i)

		beta<-rnorm(numCov)
		sig2<-0.1*i 
		tau<-i
		h<-as.vector(rmnorm(n=1,0,tau*K))
		q<-rep(0.5,numCandid)


		for (j in 1:numItr)
		{
			#update beta
			SolveSigma<-solve(tau*K+diag(sig2,sampsize))
			beta<-updateBeta(Y,X,h,numCov,sig2,sigB2)
			h<-updateH(Y,X,K,tau,sig2,beta)

			# update tau
			tau<-updateTau(K,h,a0,b0,sampsize)
		
			SigTemp<-tau*K+diag(1,ncol(K))*sig2

			SolveSigma<-solve(SigTemp)
			eigVal<-eigen(SolveSigma,symmetric=TRUE,only.values=TRUE)$values
	
			detSolveSigma<-min(max(prod(eigVal),10^(-322)),10^300)

			# update sigma^2, sig2
			sig2<-updateSig2(Y, X, K,h,beta,a0,b0,sampsize)
			SigTemp<-tau*K+diag(1,ncol(K))*sig2
			SolveSigma<-solve(SigTemp)
			eigVal<-eigen(SolveSigma,symmetric=TRUE,only.values=TRUE)$values
			detSolveSigma<-min(max(prod(eigVal),10^(-322)),10^300)

			paramArray[j,i,1:numCov]<-beta
			paramArray[j,i,numCov+1]<-tau
			# update delta
	
			randIndex<-sample(numCandid,numCandid)
			for (kk in 1:numCandid)
			{
				s<-randIndex[kk]
				a<-(-0.5)*t(Y-X%*%beta)%*%SolveSigma%*%(Y-X%*%beta)+0.5*log(detSolveSigma)

				deltaTmp<-delta
				deltaTmp[s]<-1-delta[s]
				newK<-updateK(deltaTmp,distance,sampsize,rho)
				SigTemp<-tau*newK+diag(1,ncol(K))*sig2
				SolvesigTmp<-solve(SigTemp)
				eigVal<-eigen(SolvesigTmp,symmetric=TRUE,only.values=TRUE)$values
				detSolvesigTmp<-min(max(prod(eigVal),10^(-322)),10^300)

				b<-(-0.5)*t(Y-X%*%beta)%*%SolvesigTmp%*%(Y-X%*%beta)+0.5*log(detSolvesigTmp)

				maxab<-max(a,b)
				prob<-exp(a*delta[s]+b*(1-delta[s])-maxab)/(exp(a-maxab)+exp(b-maxab))
				deltaTmp[s]<-rbinom(1,1,prob)
	
				if (deltaTmp[s]!=delta[s])
				{
					delta[s]<-deltaTmp[s]
					K<-updateK(delta,distance,sampsize,rho)
					SolveSigma<-SolvesigTmp
					detSolveSigma<-detSolvesigTmp
				}
#				q[s]<-rbeta(1,aa+delta[s],bb+1-delta[s])
				q[s]<-0.5
			} # end for (kk in 1:numCandid)
			if (j==ceiling(numItr*2/3+1) && i==1)
			{
				sumDelta<-rep(0,length(delta))
				kkk<-0
			}
			else if (j>ceiling(numItr*2/3+1) && i==1)
			{
				sumDelta<-sumDelta+delta
				kkk<-kkk+1
			}
		}#end numItr
		
		sumDelta1<-sumDelta/kkk
	
		propDelta<-cbind(seq(1,numCandid),sumDelta1)
		colnames(propDelta)<-c("Variable Index","Post. Prob")
		orderProp<-propDelta[order(propDelta[,2],decreasing=T),]
		maxDiff<-max(abs(diff(orderProp[,2])))
		VarIndex<-which(abs(diff(orderProp[,2]))==maxDiff)
		VarSelect<-orderProp[1:VarIndex,1]
		VarSelect<-VarSelect[order(VarSelect)]
		
		modelSize<-length(VarSelect)
	}#end chain

	diagnosis1<-gandr.conv(paramArray[,,1])
	diagnosis2<-gandr.conv(paramArray[,,numCov+1])
	list(diagB1=diagnosis1,diagTau=diagnosis2,
		modelSize=modelSize,propDelta=propDelta,orderProp=orderProp,VarSelect=VarSelect)
}

# The grandr.conv function is to obtain posterior inferences and convergence diagnosis.
# It takes two inputs:
#        n x m matrix of posterior draws for a single parameter of interest
#              (n = number of iterations per chain; m = number of chains)
#        n1 = number of iterations to ignore as transient 
#             (defaults to 0.5*n if no number supplied)
# Output contains 3 pieces:
#      1. an approximate posterior 2.5,50,97.5% points (t-approximation)
#      2. quantiles obtained by treating (n - n1) x m matrix as samples  
#         from posterior
#      3. Gelman and Rubin's convergence measure R and an upper confidence
#         limit for R (values near 1, say less than 1.1 are desirable)
#

gandr.conv<-function(r, n1 = nrow(r)/2) {
#
#  r: matrix of simulated sequences
#  n1: length of initial transient to ignore (default = 0.5)
#
	alpha <- 0.05	# 95% intervals
	m <- ncol(r)
	x <- r[(n1 + 1):nrow(r),  ]	# part of simulated sequences to process
	n <- nrow(x)	# We compute the following statistics:
#
#  xdot:  vector of sequence means
#  s2:  vector of sequence sample variances (dividing by n-1)
#  W = mean(s2):  within MS
#  B = n*var(xdot):  between MS.
#  muhat = mean(xdot):  grand mean; unbiased under strong stationarity
#  varW = var(s2)/m:  estimated sampling var of W
#  varB = B^2 * 2/(m+1):  estimated sampling var of B
#  covWB = (n/m)*(cov(s2,xdot^2) - 2*muhat*cov(s^2,xdot)):
#                                               estimated sampling cov(W,B)
#  sig2hat = ((n-1)/n))*W + (1/n)*B:  estimate of sig2; unbiased under
#                                               strong stationarity
#  quantiles:  emipirical quantiles from last half of simulated sequences
#
	xdot <- as.vector(col.means(x))
	s2 <- as.vector(col.vars(x))
	W <- mean(s2)
	B <- n * var(xdot)
	muhat <- mean(xdot)
	varW <- var(s2)/m
	varB <- (B^2 * 2)/(m - 1)
	covWB <- (n/m) * (cov(s2, xdot^2) - 2 * muhat * cov(s2, xdot))
	sig2hat <- ((n - 1) * W + B)/n
	quantiles <- quantile(as.vector(x), probs = c(0.025, 0.25, 0.5, 0.75, 
		0.975))
	if(W > 1e-08) {
#
# non-degenerate case
# Posterior interval post.range combines all uncertainties
# in a t interval with center muhat, scale sqrt(postvar), 
# and postvar.df degrees of freedom.
#
#       postvar = sig2hat + B/(mn):  variance for the posterior interval
#                               The B/(mn) term is there because of the
#                               sampling variance of muhat.
#       varpostvar:  estimated sampling variance of postvar
#
		postvar <- sig2hat + B/(m * n)
		varpostvar <- (((n - 1)^2) * varW + (1 + 1/m)^2 * varB + 
                        2 * (n - 1) * (1 + 1/m) * covWB)/n^2
		post.df <- chisqdf(postvar, varpostvar)
		if (post.df<2) post.df=2.0001
		post.range <- muhat + 
                              sqrt(postvar) * qt(1-alpha/2, post.df) * c(-1,0,1)
#
# Estimated potential scale reduction (that would be achieved by
# continuing simulations forever) has two components:  an estimate and
# an approx. 97.5% upper bound.
#
# confshrink = sqrt(postvar/W), 
#     multiplied by sqrt(df/(df-2)) as an adjustment for the
#     width of the t-interval with df degrees of freedom.
#
# postvar/W = (n-1)/n + (1+1/m)(1/n)(B/W); we approximate the sampling dist.
# of (B/W) by an F distribution, with degrees of freedom estimated
# from the approximate chi-squared sampling dists for B and W.  (The
# F approximation assumes that the sampling dists of B and W are independent;
# if they are positively correlated, the approximation is conservative.)
#
		varlo.df <- chisqdf(W, varW)
		confshrink.range <- sqrt((c(postvar/W, (n - 1)/n + (1 + 1/m) * 
                       (1/n) * (B/W) * qf(0.975, m - 1, varlo.df)) * post.df)/
                       (post.df - 2))
		list(quantiles = quantiles, confshrink = 
			confshrink.range)
	}
	else {
#
# degenerate case:  all entries in "data matrix" are identical
#
		list(post = muhat * c(1, 1, 1), quantiles = quantiles, 
			confshrink = c(1, 1)) }
}
#
# some functions needed by the above
#
col.vars<-function(mat) {
	means <- col.means(mat)
	col.means(mat * mat) - means * means }
#
col.means<-function(mat) {
	ones <- matrix(1, nrow = 1, ncol = nrow(mat))
	ones %*% mat/nrow(mat) }
#
cov<-function(a, b) {
	m <- length(a)
	((mean((a - mean(a)) * (b - mean(b)))) * m)/(m - 1) }
#
chisqdf<-function(A, varA) {
	2 * (A^2/varA) }

#update beta
updateBeta<-function(Y,X,h,numCov,sig2,sigB2)
{
		SigmaBeta<-solve(t(X)%*%X*sig2^(-1)+diag((sigB2)^(-1),numCov))
		muBeta<-SigmaBeta%*%t(X)%*%(Y-h)/sig2
		rmnorm(n=1,muBeta,SigmaBeta)
}

# update tau
updateTau<-function(K,h,a0,b0,sampsize)
{
		shape<-sampsize/2+a0
		scale<-0.5*(t(h)%*%solve(K)%*%h+2*b0)
		rinvgamma(1,shape,scale)
}		

# update sigma^2, sig2
updateSig2<-function(Y, X, K,h,beta,a0,b0,sampsize)
{		
		shape<-sampsize/2+a0
		scale<-0.5*(t(Y-X%*%beta-h)%*%(Y-X%*%beta-h)+2*b0)
		rinvgamma(1,shape,scale)
}		

# update h
updateH<-function(Y,X,K,tau,sig2,beta)
{
		Sigmah<-solve(tau^(-1)*solve(K)+diag((sig2)^(-1),ncol(K)))
		muh<-Sigmah%*%(Y-X%*%beta)/sig2
		as.vector(rmnorm(n=1,muh,Sigmah))
}

# calculate the K matrix for different delta
updateK<-function(delta, distance,sampsize,rho)
{
	K<-matrix(rep(0,sampsize*sampsize),nrow=sampsize)

	for (i in 1:sampsize)
	{
		temp<-(t(distance[,,i])%*%delta)/rho
		K[i,]<-exp(-sqrt(temp))
	}
	return(K)
}

# Function EstRho() is used to estimate rho to be used in the subsequent variable selection
# Input is the same as the function VarSel()

# Output: this includes three pieces
# 	a)  quantiles obtained by treating (n - n1) x numChain matrix as samples  
#         from posterior, where n1 is the number of iterations after burn-in.
#	    The median will be used as the estimate of rho from the full model. 
#     b) Gelman and Rubin's convergence measure R and an upper confidence
#         limit for R (values near 1 are desirable)

EstRho<-function(Y, X, Candid, numCov=1, numItr=50000, numChain=1, a0=0.001,b0=0.001,V=0.1,sigB2=10)
{
	sampsize<-length(Y)
	numCandid<-ncol(Candid) 

	# Gaussian kernel is used
	distance<-array(0,dim=c(numCandid,sampsize,sampsize))

	for (i in 1:(sampsize))
	{
		distance[,,i]<-t((Candid - matrix(1, ncol=1, nrow=sampsize)%*%(as.matrix(Candid[i,])))^2)
	}
	rhoMat<-matrix(NA,nrow=numItr,ncol=numChain)
	for (i in 1:numChain)
	{
		beta<-rnorm(numCov)
		sig2<-0.1*i 
		tau<-1
		rho<-0.2
		delta<-rep(1,numCandid)
		K<-updateK(delta,distance,sampsize,rho)
		h<-as.vector(rmnorm(n=1,0,tau*K))
		countRho<-0
		set.seed(i*2000)

		for (j in 1:numItr)
		{
			cat("j ",j,"\n")
			beta<-updateBeta(Y,X,h,numCov,sig2,sigB2)
			tau<-updateTau(K,h,a0,b0,sampsize)
			sig2<-updateSig2(Y, X, K,h,beta,a0,b0,sampsize)
			h<-updateH(Y,X,K,tau,sig2,beta)

		
			#update rho in the kernel function
			SolveSigma<-solve(tau*K+diag(sig2,sampsize))
			eigVal<-eigen(SolveSigma,symmetric=TRUE,only.values=TRUE)$values
			detSolveSigma<-min(max(prod(eigVal),10^(-322)),10^300)

			lhOld<-(-0.5)*t(Y-X%*%beta)%*%SolveSigma%*%(Y-X%*%beta)+0.5*log(detSolveSigma)+(-a0-1)*log(rho)-b0/rho
			RhoProp<-exp(rnorm(1,log(rho),V))
			newK<-updateK(delta,distance,sampsize,RhoProp)
			SigTemp<-tau*newK+diag(1,ncol(newK))*sig2
			SolvesigTmp<-solve(SigTemp)
			eigVal<-eigen(SolvesigTmp,symmetric=TRUE,only.values=TRUE)$values
			detSolvesigTmp<-min(max(prod(eigVal),10^(-322)),10^300)

			lhNew<-(-0.5)*t(Y-X%*%beta)%*%SolvesigTmp%*%(Y-X%*%beta)+0.5*log(detSolvesigTmp)+(-a0-1)*log(RhoProp)-b0/RhoProp
			AccRatio<-lhNew-lhOld+log(RhoProp)-log(rho)	
			judge<-runif(1)
			if (AccRatio>log(judge))			
			{
				countRho<-countRho+1
				rho<-RhoProp
				SolveSigma<-SolvesigTmp
				detSolveSigma<-detSolvesigTmp
			}
			rhoMat[j,i]<-rho
		}#end numItr
	}#end chain
	diagnosis<-gandr.conv(rhoMat)
	list(diag=diagnosis)
}

data<-read.table("C:\\Documents and Settings\\hzhang\\My Documents\\papers\\Arnab.Var.Selection\\paper\\SupplementalMaterial\\Linear.Example.txt", header=F)
Y<-data[,2]
X<-data[,3]
Candid<-data[,4:(ncol(data)-1)]

VarSelResults<-VarSel(Y, X, Candid, rho=1, numCov=1, numItr=1000, numChain=2, a0=0.001,b0=0.001,sigB2=10)