library(mnormt)
library(msm)

# Function VarSel() is used to select important variables in a reproducing kernel
# Input:
# 	Y: response variable (nx1)
# 	X: matrix of covariates (nxnumCov)
#	Candid: matrix of candidate variables (nxp)
# 	numCov: number of covariates
#	rho: the scale parameter in the Gaussian kernel
#	tau: the overall set effect
#	numChain: number of MCMC chains
# 	numItr: number of iterations per chain
# 	sigBeta is the variance in the prior of beta

# Output: this includes three pieces
# 	a)  quantiles obtained by treating (n - n1) x numChain matrix as samples  
#         from posterior of the first beta, where n1 is the number of iterations after burn-in.
#     b) Gelman and Rubin's convergence measure R and an upper confidence
#         limit for R (values near 1 are desirable) for each of beta
#	c) model size
#	d) indices of selected variables
# 	e) posterior probability of selecting each variable
# 	f) ordered posterior probabilities of each variable

VarSel<-function(Y, X, Candid, numCov=1, rho=1, tau=0.8, numItr=100, numChain=2,sigBeta=10)
{
	set.seed(12345)
	sampsize<-length(Y)
	numCandid<-ncol(Candid) 
	paramArray<-array(0,dim=c(numItr,numChain,numCov+1))

	# Gaussian kernel is used
	distance<-array(0,dim=c(numCandid,sampsize,sampsize))
	set.seed(12345)
	delta<-rbinom(numCandid,1,0.5)

	for (i in 1:(sampsize))
	{
		distance[,,i]<-t((Candid - matrix(1, ncol=1, nrow=sampsize)%*%(as.matrix(Candid[i,])))^2)
	}
	K<-updateK(delta,distance,sampsize,rho)
	error<-0
	for (i in 1:numChain)
	{
		beta<-rnorm(numCov)
		meanz<-rep(0,length(Y))
		varz<-diag(1,length(Y),length(Y))
		Z<-rmnorm(1,meanz,varz)
		delta<-rbinom(numCandid,1,0.5)
		K<-updateK(delta,distance,sampsize,rho)
		error<-0

		for (j in 1:numItr)
		{
			cat("iter ",j,"\n")
			var<-tau*K+diag(-tau+1,nrow(K),ncol(K))
			invVar<-try(solve(var))
			if (class(invVar)=="try-error")
			{
				error<-1
				break
			}
			else
			{
				invVar<-solve(var)
			}

			#update Z (latent normal)
			for (jj in 1:length(X))
			{
				rowvarnoi<-var[jj,]
				rowvarnoi<-rowvarnoi[-jj]

				rowinvVarnoi<-invVar[jj,]
				rowinvVarnoi<-rowinvVarnoi[-jj]
				invVarnoi<-invVar[-jj,-jj]-(rowinvVarnoi)%*%t(rowinvVarnoi)/invVar[jj,jj]

				meanZi<-rowvarnoi%*%invVarnoi%*%(Z[-jj]-X[-jj]%*%t(beta))+X[jj]%*%beta
				varZi<-1-rowvarnoi%*%invVarnoi%*%(rowvarnoi)
				if (varZi<0)
				{
					varZi<-10^(-40)
				}
				sdvarZi<-sqrt(varZi)
				if (Y[jj]==1)
				{
					if (sdvarZi<10^(-15) && meanZi<0)
					{
						Z[jj]<-10^(-10)
					}
					else
					{
						Z[jj]<-rtnorm(1,meanZi,sdvarZi,lower=0)
					}
				}
				else
				{
					if (sdvarZi<10^(-15) && meanZi>0)
					{
						Z[jj]<-0
					}
					else
					{
						Z[jj]<-rtnorm(1,meanZi,sdvarZi,upper=0)
					}
				}
			}

			#update beta
			Vbeta<-try(solve(diag(sigBeta^(-1),numCov,numCov)+t(X)%*%invVar%*%X))
			if (class(Vbeta)=="try-error")
			{
				error<-1
				break
			}
			else
			{
				Vbeta<-solve(diag(sigBeta^(-1),numCov,numCov)+t(X)%*%invVar%*%X)
				Mbeta<-Vbeta%*%(X%*%invVar%*%t(Z))
				beta<-rmnorm(1,Mbeta,Vbeta)
			}
			paramArray[j,i,1:numCov]<-beta

			# update delta
			randIndex<-sample(numCandid,numCandid)
			for (kk in 1:numCandid)
			{
				s<-randIndex[kk]
				var<-tau*K+diag(-tau+1,nrow(K),ncol(K))

				invVar<-try(solve(var))
				if (class(invVar)=="try-error")
				{
					error<-1
					break
				}
				else
				{
					invVar<-solve(var)
				}

				a<-0.5*log(det(invVar))-0.5*(t(t(Z)-X%*%beta)%*%invVar%*%(t(Z)-X%*%beta))

				deltaTmp<-delta
				deltaTmp[s]<-1-delta[s]
				newK<-updateK(deltaTmp,distance,sampsize,rho)
				newvar<-tau*newK+diag(-tau+1,nrow(newK),ncol(newK))

				newinvVar<-try(solve(newvar))
				if (class(newinvVar)=="try-error")
				{
					error<-1
					break
				}
				else
				{
					newinvVar<-solve(newvar)
				}
					
				b<-0.5*log(det(newinvVar))-0.5*(t(t(Z)-X%*%beta)%*%newinvVar%*%(t(Z)-X%*%beta))

				maxab<-max(a,b)
				if (a==Inf && b<Inf) prob<-delta[s]
				else if (a<Inf && b==Inf) prob<-1-delta[s]
				else if (a<Inf && b<Inf) prob<-exp(a*delta[s]+b*(1-delta[s])-maxab)/(exp(a-maxab)+exp(b-maxab))

				deltaTmp[s]<-rbinom(1,1,prob)
				if (deltaTmp[s]!=delta[s])
				{
					delta[s]<-deltaTmp[s]
					K<-updateK(deltaTmp,distance,sampsize,rho)
				}
			}
			if (error==1) break
			cat("delta ",delta,"\n")
			if (j==ceiling(numItr/2+1) && i==1)
			{
				sumDelta<-rep(0,length(delta))
				kkk<-0
			}
			else if (j>ceiling(numItr/2+1) && i==1)
			{
				sumDelta<-sumDelta+delta
				cat("sumDelta ",sumDelta,"\n")
				kkk<-kkk+1
			}
		}#end numItr
		if (error==1) break

		sumDelta<-sumDelta/kkk

		propDelta<-cbind(seq(1,numCandid),sumDelta)
		orderProp<-propDelta[order(propDelta[,2],decreasing=T),]
		maxDiff<-max(abs(diff(orderProp[,2])))
		VarIndex<-which(abs(diff(orderProp[,2]))==maxDiff)
		VarSelect<-orderProp[1:VarIndex,1]
		VarSelect<-VarSelect[order(VarSelect)]
		
		modelSize<-length(VarSelect)
	}#end chain
	diagnosis1<-gandr.conv(paramArray[,,1])
	list(diagB1=diagnosis1,modelSize=modelSize,propDelta=propDelta,orderProp=orderProp,
		VarSelect=VarSelect)
}

# calculate the kernel matrix (Gaussian kernel)
updateK<-function(delta, distance,sampsize,rho)
{
	K<-matrix(rep(0,sampsize*sampsize),nrow=sampsize)
	for (i in 1:sampsize)
	{
		temp<-(t(distance[,,i])%*%delta)/rho
		K[i,]<-exp(-temp)
	}
	return(K)
}

# The grandr.conv function is to obtain posterior inferences and convergence diagnosis.
# The method for convergence diagnosis is from Bayesian Data Analysis by Gelman, Carlin, 
# Stern, and Rubin. 2003. Taylor & Francis.
# The codes are adapted from the program written by Professor Hal Stern.
# It takes two inputs:
#        n x m matrix of posterior draws for a single parameter of interest
#              (n = number of iterations per chain; m = number of chains)
#        n1 = number of iterations to ignore as transient 
#             (defaults to 0.5*n if no number supplied)
# Output contains 3 pieces:
#      1. an approximate posterior 2.5,50,97.5% points (t-approximation)
#      2. quantiles obtained by treating (n - n1) x m matrix as samples  
#         from posterior
#      3. Gelman and Rubin's convergence measure R and an upper confidence
#         limit for R (values near 1, say less than 1.1 are desirable)
#

gandr.conv<-function(r, n1 = nrow(r)/2) {
#
#  r: matrix of simulated sequences
#  n1: length of initial transient to ignore (default = 0.5)
#
	alpha <- 0.05	# 95% intervals
	m <- ncol(r)
	x <- r[(n1 + 1):nrow(r),  ]	# part of simulated sequences to process
	n <- nrow(x)	# We compute the following statistics:
#
#  xdot:  vector of sequence means
#  s2:  vector of sequence sample variances (dividing by n-1)
#  W = mean(s2):  within MS
#  B = n*var(xdot):  between MS.
#  muhat = mean(xdot):  grand mean; unbiased under strong stationarity
#  varW = var(s2)/m:  estimated sampling var of W
#  varB = B^2 * 2/(m+1):  estimated sampling var of B
#  covWB = (n/m)*(cov(s2,xdot^2) - 2*muhat*cov(s^2,xdot)):
#                                               estimated sampling cov(W,B)
#  sig2hat = ((n-1)/n))*W + (1/n)*B:  estimate of sig2; unbiased under
#                                               strong stationarity
#  quantiles:  emipirical quantiles from last half of simulated sequences
#
	xdot <- as.vector(col.means(x))
	s2 <- as.vector(col.vars(x))
	W <- mean(s2)
	B <- n * var(xdot)
	muhat <- mean(xdot)
	varW <- var(s2)/m
	varB <- (B^2 * 2)/(m - 1)
	covWB <- (n/m) * (cov(s2, xdot^2) - 2 * muhat * cov(s2, xdot))
	sig2hat <- ((n - 1) * W + B)/n
	quantiles <- quantile(as.vector(x), probs = c(0.025, 0.25, 0.5, 0.75, 
		0.975))
	if(W > 1e-08) {
#
# non-degenerate case
# Posterior interval post.range combines all uncertainties
# in a t interval with center muhat, scale sqrt(postvar), 
# and postvar.df degrees of freedom.
#
#       postvar = sig2hat + B/(mn):  variance for the posterior interval
#                               The B/(mn) term is there because of the
#                               sampling variance of muhat.
#       varpostvar:  estimated sampling variance of postvar
#
		postvar <- sig2hat + B/(m * n)
		varpostvar <- (((n - 1)^2) * varW + (1 + 1/m)^2 * varB + 
                        2 * (n - 1) * (1 + 1/m) * covWB)/n^2
		post.df <- chisqdf(postvar, varpostvar)
		if (post.df<2) post.df=2.0001
		post.range <- muhat + 
                              sqrt(postvar) * qt(1-alpha/2, post.df) * c(-1,0,1)
#
# Estimated potential scale reduction (that would be achieved by
# continuing simulations forever) has two components:  an estimate and
# an approx. 97.5% upper bound.
#
# confshrink = sqrt(postvar/W), 
#     multiplied by sqrt(df/(df-2)) as an adjustment for the
#     width of the t-interval with df degrees of freedom.
#
# postvar/W = (n-1)/n + (1+1/m)(1/n)(B/W); we approximate the sampling dist.
# of (B/W) by an F distribution, with degrees of freedom estimated
# from the approximate chi-squared sampling dists for B and W.  (The
# F approximation assumes that the sampling dists of B and W are independent;
# if they are positively correlated, the approximation is conservative.)
#
		varlo.df <- chisqdf(W, varW)
		confshrink.range <- sqrt((c(postvar/W, (n - 1)/n + (1 + 1/m) * 
                       (1/n) * (B/W) * qf(0.975, m - 1, varlo.df)) * post.df)/
                       (post.df - 2))
		list(quantiles = quantiles, confshrink = 
			confshrink.range)
	}
	else {
#
# degenerate case:  all entries in "data matrix" are identical
#
		list(post = muhat * c(1, 1, 1), quantiles = quantiles, 
			confshrink = c(1, 1)) }
}
#
# some functions needed by the above
#
col.vars<-function(mat) {
	means <- col.means(mat)
	col.means(mat * mat) - means * means }
#
col.means<-function(mat) {
	ones <- matrix(1, nrow = 1, ncol = nrow(mat))
	ones %*% mat/nrow(mat) }
#
cov<-function(a, b) {
	m <- length(a)
	((mean((a - mean(a)) * (b - mean(b)))) * m)/(m - 1) }
#
chisqdf<-function(A, varA) {
	2 * (A^2/varA) }

data<-read.table("C:\\Documents and Settings\\hzhang\\My Documents\\papers\\Arnab.Var.Selection\\paper\\SupplementalMaterial\\Probit.Example3.txt", header=F)
Y<-data[,2]
X<-data[,3]
Candid<-data[,4:ncol(data)]

VarSelRst<-VarSel(Y,X,Candid,numCov=1,rho=1,tau=0.8,numItr=200,numChain=2,sigBeta=10)

