#!/usr/bin/Rscript --silent

DISTPATH = Sys.getenv(x = "DEEP_READMIX_PATH", unset = "/usr/share/readmix", names = TRUE)

cat("Loading required packages...\n")
pkgrequire("lpSolve")
pkgrequire("optparse")
source(paste(DISTPATH, "/R/readmix_deep.r", sep = ""))

option_list <- list(
	make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
	help="Print extra output [default=false]"),
	make_option(c("-g", "--noguesstoanswer"), action="store_true", default=FALSE,
	help="guess.to.answer [default=true]"),
	make_option(c("-s", "--skipiterative"), action="store_true", default=FALSE,
	help="skip.iterative [default=false]"),
	make_option(c("-b", "--buildrandom"), action="store_true", default=FALSE,
	help="build.random [default=false]"),
	make_option(c("-c", "--scaleguess"), action="store_true", default=FALSE,
	help="scale.guess [default=false]"),
	make_option(c("-e", "--restrictpops"), action="store_true", default=FALSE,
	help="restrict.pops [default=false]"),
	make_option(c("-a", "--sampleguess"), action="store_true", default=FALSE,
	help="sample.guess [default=false]"),
	make_option(c("-l", "--skipsimilar"), action="store_true", default=FALSE,
	help="skip.similar [default=false]"),
	make_option(c("-w", "--skipswap"), action="store_true", default=FALSE,
	help="skip.swap [default=false]"),
	make_option(c("-y", "--dry"), action="store_true", default=FALSE,
	help="DRY [default=false]"),
	make_option(c("-d", "--rmxdata"), type="integer", default=1,
	help="rmxdata [default %default]", metavar="number"),
	make_option(c("-r", "--seedrng"), type="integer", default=32267,
	help="seedrng [default %default]", metavar="number"),
	make_option(c("-i", "--minaccepted"), type="integer", default=1,
	help="min.accepted [default %default]", metavar="number"),
	make_option(c("-o", "--gmaxiter"), type="integer", default=0,
	help="Gmax, by default it is calculated automaticaly", metavar="number"),
	make_option(c("-k", "--minadmcoef"), type="double", default=0.001,
	help = "min.adm.coef [default %default]"),
	make_option(c("-x", "--maxadmcoef"), type="double", default=0.999,
	help = "max.adm.coef [default %default]"),
	make_option(c("-z", "--minadmvec"), type="double", default=-0.025,
	help = "min.adm.vec [default %default]"),
	make_option(c("-u", "--betauser"), type="double", default=0.65,
	help = "beta_user [default %default]"),
	make_option(c("-f", "--remcutoff"), type="double", default=-0.01,
	help = "rem.cutoff [default %default]"),
	make_option(c("-t", "--maxerror"), type="double", default=0.03,
	help = "max.error [default %default]"),
	make_option(c("-p", "--stablefrac"), type="double", default=0.75,
	help = "stablefrac [default %default]")
)
opt_parser <- OptionParser(usage = "usage: %prog [options]", option_list = option_list, description = "reAdmix 2", epilogue = "Send feedback to mackoel@gmail.com")
flaggs <- parse_args(opt_parser, args = commandArgs(trailingOnly = TRUE), print_help_and_exit = TRUE, positional_arguments = TRUE)
opts <- flaggs$options
args <- flaggs$args

#args <- commandArgs(TRUE)

USAGE_STRING = "============= ReAdMix with DEEP Script ========================
Usage:

$ deep-admixture.R <reference> <input> <job_id> <out_dir> [<cond-pop> [<num_repl> [<subset_size> [<keep.init>]]]]

A bunch of options are read (sourced) from file <job_id>.properties 
if it exists

Arguments:
1 Reference file - a CSV file with mean population admixture
2 Input fie
3 Job ID
4 Output directory
5 Initial populations file  - use \"\" for none
6 Number of replications (default 2)
7 Subset size ( desired number of populations in the solution) default 4
8 keep.init = Y|N

The default number of candidate solutions is 2 * (subsetSize + 1)
in each of number of replications runs. All of them are analyzed together to
determine stable populations.

To use standard deviations for reference
place a file with name 
      (your_reference_file_name).sdev 
in the same directory where the reference is.

To use 3rd step of the Phase2 (local search within similar populations),
place a file named 
      (your_reference_file_name).clust 
in the same directory.
*************************************************************************\n
"
if(length(args) < 4){
	cat(USAGE_STRING)
	stop("Please supply valid arguments\n")
}

# Argument parsing

# Reference file : a csv file with (mean) admixture proportions of known populations
REFERENCE_FILE = args[[1]] # 'Harappa1.csv'

#  Additional file: errors for reference:
REF_SD_FILE = paste(args[[1]], ".sdev", sep="")

# Additional file: clustering for reference
REF_CLUSTER_FILE = paste(args[[1]], ".clust", sep="")


# Infile: file that has the test sample as its second row (the first row is the header naming the ancestral populations)
INFILE = args[[2]]

# ID for the job, arbitrary string
ID = args[[3]]

# Directory where to put the result file
OUT_DIR = args[[4]]  #"output_files"
scrf <- file.path(OUT_DIR, ID)
if(!file.exists(scrf)) dir.create(scrf, rec=TRUE)

# Populations to begin with
PRESENT_POPS_FILE = ""
keep.init = TRUE
cond = "free"
if(length(args) > 4) {
	optval <- args[[5]]
	if (optval == "equal") {
		cond = "equal"
	} else if (optval == "free") {
		cond = "free"
	} else if (optval != "NONE") {
		PRESENT_POPS_FILE = args[[5]]
	}
}

if( PRESENT_POPS_FILE != "" && file.exists(PRESENT_POPS_FILE) && length(readLines(PRESENT_POPS_FILE)) > 0 ) {
	CONDITIONAL = TRUE
	present_pops = read.table(PRESENT_POPS_FILE, stringsAsFactors = FALSE)
	cat(rownames(t), "Reading starting set of populations from ", PRESENT_POPS_FILE, "...\n")
	if (ncol(present_pops) != 1) {
		stop("Error in the populations file: there should be one population per row \n")
	}
	present_pops = present_pops[[1]]  # convert to vector
	comments = grep("^#", present_pops)
	if (length(comments) > 0) {
		present_pops = present_pops[-comments]
	}
	keep.init = rep(TRUE, length(present_pops) )
	tentative = grep("^[?]", present_pops)
	if (length(tentative) > 0) {
		present_pops[tentative] = gsub( "^[?]", "", present_pops[tentative])
		keep.init[tentative] = FALSE
	}
} else {
	present_pops = integer(0)
	CONDITIONAL = FALSE
}

if (length(args) > 5) {
	N_HINT = as.integer(args[[6]])
} else N_HINT = 2

# Reordering columns and converting from percentages
prep_input <- function(m) {
	m = as.matrix(m)
	rsums <- rowSums(m)
	if (any(rsums <= 0)) stop("Bad input.")
	m <- m/rsums
	m
}

R = read.csv(REFERENCE_FILE, row.names = 1)
R <- prep_input(R)
ref.pops.K <- dim(R)[2]
# Reference population errors (sdev)
Rsd = NULL
if (file.exists(REF_SD_FILE)) {
	Rsd = read.csv(REF_SD_FILE, row.names=1)
	Rsd[is.na(Rsd)]=0
	Rsd = as.matrix(Rsd)
	if (!all(dim(Rsd) == dim(R))) {
		cat(rownames(t), "Reference stdev table size (", dim(Rsd),  ")does not match that of reference means", dim(R), ".\n")
		stop()
	}
} else {
	cat(rownames(t), "Proceeding without reference error data.\n")
}

if (file.exists(REF_CLUSTER_FILE)) {
	R_cl = read.table(REF_CLUSTER_FILE, stringsAsFactors = F)
	if (ncol(R_cl) > 1) {
		rownames(R_cl) = R_cl[,1]
		R_cl = (R_cl[,2, drop=TRUE])
	} else {
		R_cl = (R_cl[,1, drop=TRUE])
	}
} else {
	R_cl = NULL
}

test = read.csv(INFILE, row.names = 1, header = TRUE) 
test <- prep_input(test)
print(test)

cat(" \n\n")
cat("Starting.. .\n");

# t = test[1, , drop = FALSE]
test_group = test[ , , drop = FALSE]
if (CONDITIONAL) {
	subsetSize = length(present_pops) + 2
	init_rows = present_pops
} else {
	init_rows = integer(0)
	subsetSize = 4
}

if (length(args) >= 7 )	{
	optval <- as.integer(args[[7]])
	if (optval > 0) subsetSize = optval
}

if (length(args) >= 8 )	{
	optval <- args[[8]]
	if (optval == "Y") {
		keep.init = TRUE
	} else if (optval == "N") {
		keep.init = FALSE
	}
}

# For DEEP we need at least 2*(subsetSize + 1)
# solutions in the set (N_DE/NMETA)
# in order to be able to recombine.
# We also can use NMETA random seeds.
# Consequently N_HINT is mapped to NMETA,
# N_DE = 2*(subsetSize + 1)
# All of them could be overwritten in
# properties file.
# That is done to keep the same command line.

NMETA <- N_HINT
guess.to.answer <- !opts$noguesstoanswer
skip.iterative <- opts$skipiterative
build.random <- opts$buildrandom
scale.guess <- opts$scaleguess
restrict.pops <- opts$restrictpops
sample.guess <- opts$sampleguess
skip.similar <- opts$skipsimilar
skip.swap <- opts$skipswap
DRY <- opts$dry
verbose <- opts$verbose
rmxdata <- opts$rmxdata
seed.rng <- opts$seedrng
min.adm.coef <- opts$minadmcoef
max.adm.coef <- opts$maxadmcoef
min.adm.vec <- opts$minadmvec
beta_user <- opts$betauser
min.accepted <- opts$minaccepted
rem.cutoff <- opts$remcutoff
max.error <- opts$max.error
stable.frac <- opts$stablefrac
QUANT <- max(2 * (subsetSize + 1), 10)
N_DE <- QUANT * NMETA
search.iter <- max(14, floor(0.03 * dim(R)[1] + 0.5))
if (opts$gmaxiter > 0) {
	search.iter <- opts$gmaxiter
}

set.seed(seed.rng)
cat(ID, " ref.pops.K = ", ref.pops.K, "\n")
result <- group_readmix_deep(
			N_DE, 
			R, 
			test_group, 
			init_rows = init_rows,
			subsetSize = subsetSize, 
			search.iter = search.iter,
			min.adm.coef = min.adm.coef,
			max.adm.coef = max.adm.coef,
			min.adm.vec = min.adm.vec,
			min.accepted = min.accepted,
			beta_user = beta_user,
			rem.cutoff = rem.cutoff,
			max.error = max.error,
			stable.frac = stable.frac,
			keep.init = keep.init, 
			ref.err = Rsd, R_cl = R_cl,
			log.dir = scrf,
			rmxdata = rmxdata,
			rmxref = REFERENCE_FILE,
			guess.to.answer = guess.to.answer,
			skip.iterative = skip.iterative,
			build.random = build.random,
			scale.guess = scale.guess,
			sample.guess = sample.guess,
			restrict.pops = restrict.pops,
			skip.similar = skip.similar,
			skip.swap = skip.swap,
			N_META = NMETA,
			cond = cond,
			dry = DRY,
			verbose = verbose,
			job_id = ID
		)


OUT_FILE = file.path(OUT_DIR, paste0("result-", ID, ".csv"))

OUT_TEXT_FILE = file.path(OUT_DIR, paste0("prediction-",ID,".csv"))

OUT_MEANS_FILE = file.path(OUT_DIR, paste0("means-",ID,".csv"))

write.csv(file=OUT_TEXT_FILE, result$pred)
write.csv(file=OUT_FILE, result$ans)

meancoef <- colMeans(result$ans)
summ <-as.data.frame(meancoef[order(-meancoef)])
colnames(summ) <- "mean coef"
write.csv(file=OUT_MEANS_FILE, summ)



