#!/bin/bash
set -eo pipefail

# Declare inputs
# The FASTA, extra-sequence, GTF, PFM and mitochondrial-contig options may each
# receive several values, so they are collected in arrays
declare -a Reference_Genome=() Extra_Sequences=() Annotation_Files=()
declare -a Transcription_Factor_Motif_PFMs=() Mitochondrial_Contigs=()
# Union of the file inputs gathered while parsing the command line
declare -a all_inputs=()

# Boolean switches; each one stays false unless its flag is passed
Disable_Biotype_Filtering=false
Disable_Readthrough_Filtering=false
Filter_PARs=false
WTA_Only_Index=false

# "nil" is the placeholder meaning that no archive prefix was supplied
Archive_Prefix=nil
Extra_STAR_PARAMS=''
# Thread count defaults to whatever nproc reports
Maximum_Threads=$(nproc)

# Abort with a message when an option that needs a value is the last token.
#   $@ - the remaining command-line words, starting with the option itself
# Without this check `shift 2` fails and the script dies under `set -e`
# without telling the user which option was incomplete.
require_option_value() {
	if [[ $# -lt 2 ]]; then
		echo "Missing value for option: $1" >&2
		exit 1
	fi
}

# Parse the command line into the global option variables.
#   $@ - the script's arguments
# File-list options (--reference-genome, --extra-sequences, --gtf,
# --transcription-factor-motif-pfm) and --mitochondrial-contigs consume every
# following word up to the next word starting with "--". The four file-list
# options also record each value in all_inputs.
# Exits with status 1 on an unknown option or a missing option value.
parse_cli_arguments() {
	while [[ "$#" -gt 0 ]]; do
		case "$1" in
			--reference-genome)
				shift
				while [[ $# -gt 0 && ! $1 =~ ^-- ]]; do
					Reference_Genome+=("$1")
					all_inputs+=("$1")
					shift
				done
				;;
			--extra-sequences)
				shift
				while [[ $# -gt 0 && ! $1 =~ ^-- ]]; do
					Extra_Sequences+=("$1")
					all_inputs+=("$1")
					shift
				done
				;;
			--gtf)
				shift
				while [[ $# -gt 0 && ! $1 =~ ^-- ]]; do
					Annotation_Files+=("$1")
					all_inputs+=("$1")
					shift
				done
				;;
			--transcription-factor-motif-pfm)
				shift
				while [[ $# -gt 0 && ! $1 =~ ^-- ]]; do
					Transcription_Factor_Motif_PFMs+=("$1")
					all_inputs+=("$1")
					shift
				done
				;;
			--mitochondrial-contigs)
				# Contig names are not files, so they are not added to all_inputs
				shift
				while [[ $# -gt 0 && ! $1 =~ ^-- ]]; do
					Mitochondrial_Contigs+=("$1")
					shift
				done
				;;
			--maximum-threads)
				require_option_value "$@"
				Maximum_Threads="$2"
				shift 2
				;;
			--archive-prefix)
				require_option_value "$@"
				Archive_Prefix="$2"
				shift 2
				;;
			--extra-star-params)
				require_option_value "$@"
				Extra_STAR_PARAMS="$2"
				# Remove a pair of single quotes ('') from the beginning and from the end of the string
				Extra_STAR_PARAMS="${Extra_STAR_PARAMS#\'\'}"
				Extra_STAR_PARAMS="${Extra_STAR_PARAMS%\'\'}"
				shift 2
				;;
			--disable-biotype-filtering)
				Disable_Biotype_Filtering=true
				shift
				;;
			--disable-readthrough-filtering)
				Disable_Readthrough_Filtering=true
				shift
				;;
			--filter-PARs)
				Filter_PARs=true
				shift
				;;
			--wta-only-index)
				WTA_Only_Index=true
				shift
				;;
			*)
				echo "Unknown param passed: $1"
				exit 1
				;;
		esac
	done
}

parse_cli_arguments "$@"


# Log the effective configuration; array options are shown space-separated
printf '%s\n' "******************************  Running Rhapsody Reference Generator *********************************"
printf '%s\n' "Reference_Genome file: ${Reference_Genome[*]}"
printf '%s\n' "Extra sequence file: ${Extra_Sequences[*]}"
printf '%s\n' "GTF is ${Annotation_Files[*]}"
printf '%s\n' "Transcription Factor Motif PFM is ${Transcription_Factor_Motif_PFMs[*]}"
printf '%s\n' "Mitochondrial Contigs is ${Mitochondrial_Contigs[*]}"
printf '%s\n' "Disable Biotype filtering is $Disable_Biotype_Filtering"
printf '%s\n' "Disable Readthrough filtering is $Disable_Readthrough_Filtering"
printf '%s\n' "Filtering PARs is $Filter_PARs"
printf '%s\n' "Generating WTA index only is $WTA_Only_Index"
printf '%s\n' "Output archive prefix is $Archive_Prefix"
printf '%s\n' "Maximum_Threads is $Maximum_Threads"


# All intermediate files are written inside a scratch directory
# NOTE(review): input paths are used as given after this cd, so relative paths
#  would resolve against the scratch directory - confirm callers pass absolute paths
workdir=ref_build_temp
mkdir "$workdir"
cd "$workdir"


# Unzip all input files if they were zipped, iterating through all available inputs,
#  then swap any input that needs to be decompressed to the decompressed version in workdir

# Decompressed list for removal of intermediate files
declare -a decom_list=()
# Fresh arrays that will hold the per-category file names, used for counting
declare -a GTF_inputs=() fasta_inputs=() extra_seq=() pfm_files=()
# Pipes, colons, and hashtags in contig names are replaced by an underscore
replacement_string='_'
illegal_characters_regex='[|#:]'
# awk program for tab-separated GTF rows: rewrite column 1 of every non-comment line
awk_command="BEGIN{FS=OFS=\"\\t\"} !/^#/ {gsub(/${illegal_characters_regex}/, \"${replacement_string}\", \$1)} {print}"
# sed program for FASTA: rewrite header lines (those starting with '>') only
sed_command="/^>/ s/${illegal_characters_regex}/${replacement_string}/g"
# Return success when the first argument is exactly equal to one of the
# remaining arguments.
input_list_contains() {
	local wanted=$1 candidate
	shift
	for candidate in "$@"; do
		if [[ "$candidate" == "$wanted" ]]; then
			return 0
		fi
	done
	return 1
}

# Store in the array `remaining_entries` every argument after the first that is
# not exactly equal to the first argument.
# Whole-element comparison is used on purpose: a substring/glob removal such as
# "${array[@]/$file}" also mangles other entries that merely contain $file.
input_list_without() {
	local unwanted=$1 candidate
	shift
	remaining_entries=()
	for candidate in "$@"; do
		if [[ "$candidate" != "$unwanted" ]]; then
			remaining_entries+=("$candidate")
		fi
	done
}

# Write the working copy of one input file into the current directory.
#   $1 - path of the input file; decompressed on the fly when it ends in .gz
#   $2 - kind of input: "gtf", "fasta" or "pfm"
# Sets `staged_name` to the name of the file written. For gzipped inputs that
# name is also appended to decom_list.
# GTF rows go through awk_command, FASTA lines through sed_command, PFM files
# are copied unchanged.
stage_input_file() {
	local source_path=$1 input_kind=$2
	if [[ $source_path == *.gz ]]; then
		# Add decompressed file name to array
		staged_name="$(basename "${source_path%%.gz}")"
		decom_list+=("${staged_name}")
		case $input_kind in
			gtf)
				echo "Decompressing $source_path and removing pipes, colons, and hashtags from chromosome names..."
				gunzip -c "$source_path" | awk "${awk_command}" > "${staged_name}"
				;;
			fasta)
				echo "Decompressing $source_path and removing pipes, colons, and hashtags from FASTA headers..."
				gunzip -c "$source_path" | sed -e "${sed_command}" > "${staged_name}"
				;;
			pfm)
				echo "Decompressing $source_path..."
				gunzip -c "$source_path" > "${staged_name}"
				;;
		esac
	else
		staged_name="$(basename "${source_path}")"
		case $input_kind in
			gtf)
				echo "$source_path is not gzipped, removing pipes, colons, and hashtags from chromosome names..."
				awk "${awk_command}" "$source_path" > "${staged_name}"
				;;
			fasta)
				echo "$source_path is not gzipped, removing pipes, colons, and hashtags from FASTA headers..."
				# Apply sed to replace forbidden characters only in FASTA headers for non-gzipped files
				sed -e "${sed_command}" "$source_path" > "${staged_name}"
				;;
			pfm)
				echo "$source_path is not gzipped, copying to working directory..."
				cp "$source_path" "${staged_name}"
				;;
		esac
	fi
}

# Stage every entry of all_inputs into the current directory and swap it, in the
# option array it came from, for the name of its working copy. Afterwards fill
# GTF_inputs, fasta_inputs, extra_seq and pfm_files with the non-empty entries.
# Categories are tried in the order GTF, reference genome, extra sequences, PFM.
# Exits with status 1 ('Invalid Inputs') when an entry is in none of them.
stage_all_inputs() {
	local input_path entry staged_name=""
	local -a remaining_entries=()

	for input_path in "${all_inputs[@]}"; do
		if input_list_contains "$input_path" "${Annotation_Files[@]}"; then
			# Replace the original path by the name of its working copy
			input_list_without "$input_path" "${Annotation_Files[@]}"
			Annotation_Files=("${remaining_entries[@]}")
			stage_input_file "$input_path" gtf
			Annotation_Files+=("${staged_name}")

		elif input_list_contains "$input_path" "${Reference_Genome[@]}"; then
			input_list_without "$input_path" "${Reference_Genome[@]}"
			Reference_Genome=("${remaining_entries[@]}")
			stage_input_file "$input_path" fasta
			Reference_Genome+=("${staged_name}")

		elif input_list_contains "$input_path" "${Extra_Sequences[@]}"; then
			input_list_without "$input_path" "${Extra_Sequences[@]}"
			Extra_Sequences=("${remaining_entries[@]}")
			stage_input_file "$input_path" fasta
			Extra_Sequences+=("${staged_name}")

		elif input_list_contains "$input_path" "${Transcription_Factor_Motif_PFMs[@]}"; then
			input_list_without "$input_path" "${Transcription_Factor_Motif_PFMs[@]}"
			Transcription_Factor_Motif_PFMs=("${remaining_entries[@]}")
			stage_input_file "$input_path" pfm
			Transcription_Factor_Motif_PFMs+=("${staged_name}")

		else
			echo 'Invalid Inputs'
			exit 1
		fi
	done

	# Copy into the per-category arrays, skipping any empty ("ghost") entry.
	# `if` is used rather than `[[ ... ]] && ...` so the function never returns
	# a failure status just because the last entry happened to be empty.
	for entry in "${Annotation_Files[@]}"; do
		if [[ -n "$entry" ]]; then GTF_inputs+=("$entry"); fi
	done
	for entry in "${Reference_Genome[@]}"; do
		if [[ -n "$entry" ]]; then fasta_inputs+=("$entry"); fi
	done
	for entry in "${Extra_Sequences[@]}"; do
		if [[ -n "$entry" ]]; then extra_seq+=("$entry"); fi
	done
	for entry in "${Transcription_Factor_Motif_PFMs[@]}"; do
		if [[ -n "$entry" ]]; then pfm_files+=("$entry"); fi
	done
}

stage_all_inputs

# Derive the output GTF name from the GTF and extra-sequence file names: each
# name loses its directory and last extension, and the stems are joined with "-"
gtf_for_star=""
file_name_limit=200
# Combine GTF_inputs and extra_seq arrays
all_files=("${GTF_inputs[@]}" "${extra_seq[@]}")
for source_name in "${all_files[@]}"; do
  # Drop the directory part and the last extension (".gtf", ".fa", ".fna", ".fasta", ...)
  name_stem=$(basename "$source_name")
  name_stem="${name_stem%.*}"

  # Stop as soon as one more stem (plus separator) would reach the character limit
  (( ${#gtf_for_star} + ${#name_stem} + 1 < file_name_limit )) || break

  # Hyphen separator only between stems, never in front of the first one
  gtf_for_star="${gtf_for_star:+${gtf_for_star}-}${name_stem}"
done

# Name of the single combined (and optionally filtered) GTF handed to STAR
gtf_for_star="$gtf_for_star-processed.gtf"

# Translate the boolean switches into the flag string passed to the annotation filter
filtering_flags=""
case $Disable_Biotype_Filtering in
  true) filtering_flags="--disable-biotype-filtering" ;;
esac
case $Disable_Readthrough_Filtering in
  true) filtering_flags+=" --disable-readthrough-filtering" ;;
esac
case $Filter_PARs in
  true) filtering_flags+=" --filter-PARs" ;;
esac


# Combine and Filter gtf features to 1 file for STAR
echo $'\nRunning mist_annotation_filter.py...\n'
# Build the argument list once; --extra-fastas is only added when there are
# extra sequence files to pass
annotation_filter_args=(--input-gtfs "${GTF_inputs[@]}")
if [[ ${#extra_seq[@]} -gt 0 ]]; then
	annotation_filter_args+=(--extra-fastas "${extra_seq[@]}")
fi
# Note: Below, we are redirecting stderr to stdout but the original stdout itself gets written to the $gtf_for_star file - This is done because the rest of the program is mainly logging to stdout
# $filtering_flags is a space-separated flag string and is left unquoted on
# purpose so that it splits into separate arguments (or into none when empty).
# The redirect target is quoted so a name containing whitespace is still one file.
# shellcheck disable=SC2086
mist_annotation_filter.py $filtering_flags "${annotation_filter_args[@]}" --log-level INFO 2>&1 1> "$gtf_for_star"

# Run STAR genome generate
echo $'\nBuilding STAR index...\n'
# Reference FASTA files first, followed by any extra sequence files; an empty
# extra_seq array simply contributes no arguments, so one command covers both cases.
# Array elements and single values are quoted so each file name stays one argument.
# ${Extra_STAR_PARAMS} is left unquoted on purpose: it carries a whole
# user-supplied parameter string that has to split into separate STAR arguments.
# shellcheck disable=SC2086
STAR --runMode genomeGenerate --genomeDir star_index --genomeFastaFiles "${fasta_inputs[@]}" "${extra_seq[@]}" --runThreadN "$Maximum_Threads" --sjdbGTFfile "$gtf_for_star" ${Extra_STAR_PARAMS}

# Unless only the WTA index was requested, also write the mitochondrial contig
# list, the combined PFM file and the bwa-mem2 index
if [[ $WTA_Only_Index != true ]]; then
	# Write Mitochondrial Contigs to a file
	mitochondrial_contigs_file="mitochondrial_contigs.txt"
	# Replace pipes, colons, and hashtags in contig names with the replacement
	# string, using the same character class as for the FASTA headers and GTF rows
	sanitized_mitochondrial_contigs=()
	for contig in "${Mitochondrial_Contigs[@]}"; do
		# printf rather than echo, so a name that looks like an echo option is passed through unchanged
		cleaned_contig=$(printf '%s\n' "$contig" | sed "s/${illegal_characters_regex}/${replacement_string}/g")
		sanitized_mitochondrial_contigs+=("$cleaned_contig")
	done
	# One contig per line. With no contigs this still writes a single empty line.
	printf "%s\n" "${sanitized_mitochondrial_contigs[@]}" > "$mitochondrial_contigs_file"

	# Concatenate any Transcription Factor Motif PFMs into one file that uses ".pfm" as suffix
	if [[ ${#pfm_files[@]} -gt 0 ]]; then
		echo "Concatenating ${pfm_files[@]}"
		# Create a common prefix: the extension-less file names joined with "."
		prefixes=()
		for pfm_file in "${pfm_files[@]}"; do
			prefix=$(basename "${pfm_file%.*}")
			prefixes+=("$prefix")
		done
		pfm_prefix=$(IFS="."; echo "${prefixes[*]}")
		concatenated_pfm_file="${pfm_prefix}.pfm"
		cat "${pfm_files[@]}" > "${concatenated_pfm_file}"
	fi

	echo $'\nBuilding bwa-mem2 index...\n'
	# Create a common prefix: the extension-less FASTA names joined with "."
	all_fasta_files=("${fasta_inputs[@]}" "${extra_seq[@]}")
	prefixes=()
	for fasta_file in "${all_fasta_files[@]}"; do
		prefix=$(basename "${fasta_file%.*}")
		prefixes+=("$prefix")
	done
	mem_prefix=$(IFS="."; echo "${prefixes[*]}")
	# Select bwa-mem2 binary to use
	bwa_mem2_bin=$(python3 -c "from mist.apps.atac.atac_align_reads import select_bwa_mem2_bin; print(select_bwa_mem2_bin())")
	# Generate the index using bwa-mem2 and move over the necessary files under a separate directory.
	# The command line is logged, then run directly instead of through eval, so
	# file names and the prefix are passed as data and never re-parsed as shell syntax.
	echo "mkdir -p bwa-mem2_index && $bwa_mem2_bin index -p $mem_prefix <(cat ${all_fasta_files[*]})"
	mkdir -p bwa-mem2_index
	# $bwa_mem2_bin stays unquoted so it word-splits as it did under eval.
	# NOTE(review): presumably a single binary name or path - confirm, then quote it.
	# shellcheck disable=SC2086
	$bwa_mem2_bin index -p "$mem_prefix" <(cat "${all_fasta_files[@]}")
	# Move the index files (everything starting with the prefix except FASTA/GTF files)
	find . -maxdepth 1 -type f -name "$mem_prefix*" ! -iregex '.*\.\(fa\|fasta\|fna\|gtf\)$' -exec mv {} bwa-mem2_index/ \;
else
	echo "Only generating STAR index for WTA, skipping bwa-mem2 index for ATAC-seq."
fi
# Two blank lines to separate the index build output from what follows
printf '\n\n'


# Packaging only goes ahead when the SA index file is present in star_index;
# otherwise stop here with a failure status
FILE=star_index/SAindex
if [[ ! -f "$FILE" ]]; then
	echo "$FILE does not exist; exiting..."
	exit 1
fi
echo "$FILE exists; proceeding to compression."

# Leave the working directory again; everything below uses paths relative to its parent
cd ..

# Create the final directory and move the items we want into it
finaldir=BD_Rhapsody_Reference_Files
mkdir "$finaldir"

# The STAR index directory and the processed GTF are moved in every mode
for artifact in "star_index/" "${gtf_for_star}"; do
	mv "${workdir}/${artifact}" "${finaldir}"
done
# Outputs that only exist when the bwa-mem2 index was built as well
if [[ $WTA_Only_Index != true ]]; then
	mv "${workdir}/${mitochondrial_contigs_file}" "${finaldir}"
	mv "${workdir}/bwa-mem2_index/" "${finaldir}"
	if (( ${#Transcription_Factor_Motif_PFMs[@]} > 0 )); then
		mv "${workdir}/${concatenated_pfm_file}" "${finaldir}"
	fi

	# Process (concatenate, compress, index) unique input FASTA files used for STAR index
	echo "Processing unique input FASTA files used for STAR index..."
	declare -A already_listed=()            # names that were handled already
	declare -a staged_fasta_paths=()        # paths inside the working directory, for cat/bgzip
	declare -a staged_fasta_names=()        # matching base names, for naming the output

	for candidate in "${fasta_inputs[@]}" "${extra_seq[@]}"; do
		# Skip empty entries and names that were seen before
		if [[ -z "$candidate" || -n "${already_listed[$candidate]}" ]]; then
			continue
		fi
		already_listed["$candidate"]=1

		# All processed FASTA files are located in the workdir; the entry is the
		# base name of the processed file.
		staged_path="${workdir}/${candidate}"
		if [[ ! -f "$staged_path" ]]; then
			echo "Error: FASTA file $staged_path (derived from unique path $candidate) not found during preparation. Exiting." >&2
			exit 1
		fi
		staged_fasta_paths+=("$staged_path")
		staged_fasta_names+=("$(basename "$candidate")")
	done

	if (( ${#staged_fasta_paths[@]} == 0 )); then
		echo "Error: No valid FASTA files found to process for final output."
		exit 1
	fi

	# Output name: extension-less base names joined with "-". A further stem is
	# only appended while the combined length stays below the character limit;
	# the first stem is always taken.
	file_name_limit=200
	final_fasta_stem=""
	for staged_name in "${staged_fasta_names[@]}"; do
		stem_part="${staged_name%.*}"
		if [[ -n "$final_fasta_stem" ]] && (( ${#final_fasta_stem} + ${#stem_part} + 1 >= file_name_limit )); then
			break
		fi
		final_fasta_stem="${final_fasta_stem:+${final_fasta_stem}-}${stem_part}"
	done
	final_fastagz_name="${final_fasta_stem}.fasta.gz"

	# A single FASTA is compressed as is; several are concatenated into a temporary file first
	temp_concat_fasta=""
	if (( ${#staged_fasta_paths[@]} == 1 )); then
		input_for_bgzip="${staged_fasta_paths[0]}"
	else
		temp_concat_fasta="${workdir}/combined_input_references.fasta"
		echo "Concatenating ${#staged_fasta_paths[@]} FASTA files into $temp_concat_fasta"
		cat "${staged_fasta_paths[@]}" > "$temp_concat_fasta"
		input_for_bgzip="$temp_concat_fasta"
	fi

	echo "Compressing $input_for_bgzip to ${finaldir}/${final_fastagz_name} using bgzip"
	bgzip -i -c "$input_for_bgzip" > "${finaldir}/${final_fastagz_name}"
	echo "Indexing ${finaldir}/${final_fastagz_name} with samtools faidx"
	samtools faidx "${finaldir}/${final_fastagz_name}"

	# Remove the temporary concatenation, if one was created
	if [[ -n "$temp_concat_fasta" && -f "$temp_concat_fasta" ]]; then
		rm "$temp_concat_fasta"
	fi
fi


# Determine final output filename: start from the default name and prepend the
# archive prefix when one was given ("nil" and the empty string both mean "none")
final_file_output_name="Rhap_reference.tar.gz"
if [[ -z "$Archive_Prefix" || "$Archive_Prefix" == "nil" ]]; then
	echo "Empty archive-prefix, using default name"
else
	final_file_output_name="${Archive_Prefix}_Rhap_reference.tar.gz"
fi

# Pack the final directory into the archive, compressing through pigz.
# The archive name is quoted so a prefix containing whitespace still yields one
# archive, and the option is placed before the operands.
echo "Compressing into final output: $final_file_output_name"
tar --use-compress-program=pigz -cvf "$final_file_output_name" "$finaldir"
# Remove the packaged directory and the scratch directory; ${var:?} aborts
# instead of running rm -rf with an empty operand should either name be unset.
rm -rf -- "${finaldir:?}" "${workdir:?}"
