#!/bin/bash
set -euo pipefail

# Validate the command line: exactly three positional arguments are required.
# On a wrong argument count the usage text is written to stderr and the script
# exits with a non-zero status, so a calling pipeline can detect the failure
# (a bare `exit` would return the status of the preceding echo, i.e. 0).
if [[ $# -ne 3 ]]; then
  echo "Number of arguments is not 3.
    Usage: bash ./AssembleAndAnnotate.sh <Input_Compressed> <Read_Limit> <VDJ_Version>
        Input_Compressed: tar.gz directory that contains fastq files that will be run with the assembler.
        Read_Limit: read limit per cell to down sample the per cell fastq file in Input_Compressed; down sampled files will be used in assembly step. Default is '0' indicating no read limit is set.
        VDJ_Version: VDJ_Version from pipeline inputs
    " 1>&2
  exit 1
fi

# Positional arguments, in the order documented by the usage text above.
Input_Compressed=$1
Read_Limit=$2
VDJ_Version=$3

# Unpack the gzip-compressed tar archive into the current working directory.
tar -zxf "${Input_Compressed}"

# Derive the directory name from the archive file name by dropping everything
# from the first '.' onwards (e.g. "name.tar.gz" -> "name").
# NOTE(review): this assumes the archive unpacks to a directory of that exact
# name and that the name itself contains no '.' -- confirm against the step
# that produces the archive.
Input_Dir_Basename=$(basename -- "${Input_Compressed}")
Input_Directory=${Input_Dir_Basename%%.*}

echo "AssembleAndAnnotate Input_Directory: ${Input_Directory}" 1>&2


echo "Starting perCellChain assembly" 1>&2

# Split the input directory name on '_' and pick out the fields used below:
#   field 0 -> sample name, field 4 -> VDJ chain type, field 5 -> split index.
# NOTE(review): this assumes the directory name has at least six '_'-separated
# fields and that the sample name contains no '_' -- confirm against the
# producer of the archive.
IFS='_' read -r -a namearray <<< "${Input_Directory}"
samplename=${namearray[0]}
vdjtype=${namearray[4]}
split_idx=${namearray[5]}

# Normalise the chain type: "TCR" is kept as is, anything else is treated as
# BCR. igblast_vdjtype holds the matching spelling used for the igblast
# arguments further down ("TCR" or "Ig").
if [[ "${vdjtype}" == "TCR" ]]; then
    igblast_vdjtype=TCR
else
    vdjtype=BCR  # Assume it is "IG"
    igblast_vdjtype=Ig
fi

# Base name shared by all output files; the fasta is created empty up front so
# that it exists even when no contigs are appended to it.
contig_output_name="${samplename}_${vdjtype}_Final_Contigs_Number_${split_idx}"
touch "${contig_output_name}.fasta"


# Collect the per-cell fastq files before the loop starts. The list has to be
# complete up front because the loop writes "*_sub.fastq" files into
# ${Input_Directory}, and those must not be picked up as new inputs.
# NUL-delimited output keeps paths with whitespace or glob characters intact.
cellFastqFiles=()
while IFS= read -r -d '' fastqPath; do
  cellFastqFiles+=("${fastqPath}")
done < <(find "${Input_Directory}" -name '*.fastq' -print0)

# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array; a plain
# "${arr[@]}" is reported as an unbound variable under `set -u` on bash < 4.4.
for cellFastqFile in ${cellFastqFiles[@]+"${cellFastqFiles[@]}"}
do
  echo " " 1>&2
  echo "-- Assembling file: ${cellFastqFile} --" 1>&2

  # File name without directory and without anything after the first '.'.
  filename=$(basename -- "${cellFastqFile}")
  filename=${filename%%.*}
  # A fastq record is four lines long.
  numReadsFastq=$(( $(wc -l < "${cellFastqFile}") / 4 ))
  assemblyInputFastq=${cellFastqFile}

  # Downsample only when a read limit is set (non-zero) and this file exceeds it.
  if [ "${Read_Limit}" -ne 0 ] && [ "${numReadsFastq}" -gt "${Read_Limit}" ]; then
    echo "    Randomly downsampling ${cellFastqFile} to ${Read_Limit} records per cell-chain" 1>&2
    # Fixed seed (-s100) so that the subsample is reproducible.
    seqtk sample -s100 "${cellFastqFile}" "${Read_Limit}" > "${cellFastqFile}_sub.fastq"
    assemblyInputFastq=${cellFastqFile}_sub.fastq
  fi

  echo "    Starting assembly using Inchworm" 1>&2
  # Inchworm needs a fasta input; it is written to the current directory.
  inchwormInputFasta=$(basename -- "${assemblyInputFastq%.fastq}.fasta")
  seqtk seq -A "${assemblyInputFastq}" > "${inchwormInputFasta}"

  # Assembly parameters. The coverage thresholds are relaxed for files with
  # few reads; note that the read count used here is the one taken before any
  # downsampling.
  nopruneFlag=""
  minKmerCount=1
  minSeedCoverage=2
  minAssemblyCoverage=2
  minLength=145
  kmerSize=18
  if [ "${numReadsFastq}" -lt 1000 ]; then
    minAssemblyCoverage=1
    nopruneFlag="--no_prune_error_kmers"
  fi
  if [ "${numReadsFastq}" -lt 600 ]; then
    minSeedCoverage=1
  fi

  # Output contigs file
  contigFilePath="${filename}_contigs.fa"

  # Run inchworm. ${nopruneFlag:+...} contributes no argument at all when the
  # flag is empty and exactly one argument otherwise.
  inchworm \
    --reads "${inchwormInputFasta}" \
    --run_inchworm \
    -K "${kmerSize}" \
    --min_assembly_coverage "${minAssemblyCoverage}" \
    -L "${minLength}" \
    --minKmerCount "${minKmerCount}" \
    ${nopruneFlag:+"${nopruneFlag}"} \
    --num_threads 1 \
    --min_seed_coverage "${minSeedCoverage}" \
    > "${contigFilePath}" 2>> assembly.log.txt

  # Use the inchworm contigs when the output file is non-empty; otherwise fall
  # back to the input reads themselves as the contigs.
  extendContigs=false
  if [ -s "${contigFilePath}" ]; then
    # Replace ';' separator with '_' for correct parsing of Supplementary alignment tag (SA)
    sed -i 's/;/_/g' "${contigFilePath}"
    if [ "${numReadsFastq}" -lt 10000 ]; then
      extendContigs=true
    fi
    echo "    Inchworm result found" 1>&2
  else
    # Remove the empty inchworm output now: contigFilePath is about to point
    # at a different file, so the cleanup at the end of the iteration would
    # otherwise leave a zero-byte *_contigs.fa behind for every such cell.
    rm -f -- "${contigFilePath}"
    contigFilePath=${inchwormInputFasta}
    echo "    No Inchworm result found - using input" 1>&2
  fi

  echo "    Starting cell level alignment, counting and contig refinement" 1>&2
  minimap2 -a -t 1 -x sr "${contigFilePath}" "${assemblyInputFastq}" > "${filename}_aln.sam" 2>> assembly.log.txt

  # Refine and extend contig ends using read alignment information (if needed). Also formats fasta record names with read and umi information as needed for downstream steps.
  # The refined contigs are reverse-complemented (seqtk seq -r) and appended to
  # the shared output fasta; stderr of the whole pipeline goes to the log.
  (
    VDJContigRefiner \
    "${contigFilePath}" \
    "${filename}_aln.sam" \
    "${numReadsFastq}" \
    "${Read_Limit}" \
    "${extendContigs}" \
      | seqtk seq -r >> "${contig_output_name}.fasta"
  ) 2>> assembly.log.txt

  # Remove the per-cell intermediates (rm -f ignores files that do not exist).
  rm -f -- "${filename}_aln.sam"
  rm -f -- "${inchwormInputFasta}"
  rm -f -- "${cellFastqFile}_sub.fastq"
  rm -f -- "${contigFilePath}"

done

echo "Finished perCellChain assembly" 1>&2

echo "Adding single read contigs into final contigs" 1>&2
# Append every file matching single_read_contigs* in the input directory to the
# final contigs fasta. The glob is expanded once into an array; when nothing
# matches (and nullglob is off, as in this script) the pattern stays literal,
# so checking that the first element exists tells whether there was a match.
singleReadContigs=( "${Input_Directory}"/single_read_contigs* )
if [[ -e "${singleReadContigs[0]}" ]]; then
    cat -- "${singleReadContigs[@]}" >> "${contig_output_name}.fasta"
fi

# Select the indices for igblast, based on species. Assume human, unless mouse is specified.
# A case statement compares the value literally, so an empty VDJ_Version or one
# containing whitespace falls through to the human default without the
# "unary operator expected" errors an unquoted [ ] test would print.
case "${VDJ_Version}" in
    mouse|mouseBCR|mouseTCR)
        igblast_species=mouse
        ;;
    *)
        igblast_species=human
        ;;
esac

echo "Annotating with IGBlast" 1>&2

# IGDATA is exported so that the igblast child process sees it, and it is also
# the root of the database paths built below.
export IGDATA="${RHAPSODY_HOME}/igblast"

# Common prefix of the V/D/J/C database paths for the selected chain type and
# species; only the trailing segment letter differs between them.
igblast_db_prefix="${IGDATA}/${igblast_vdjtype}/${igblast_species}/${igblast_species}_${igblast_vdjtype}"

# Annotate the assembled contigs; -outfmt 19 output is captured in
# igblast_result.tsv, which the post-processing below treats as a
# tab-separated table with a header row. All expansions are quoted so that an
# installation path containing whitespace is passed as a single argument.
igblastn_linux \
    -gapopen 5 -gapextend 2 -num_threads 1 -show_translation -extend_align5end \
    -domain_system imgt -outfmt 19 -min_D_match 5 -strand both \
    -ig_seqtype "${igblast_vdjtype}" \
    -germline_db_V "${igblast_db_prefix}_V" \
    -germline_db_D "${igblast_db_prefix}_D" \
    -germline_db_J "${igblast_db_prefix}_J" \
    -c_region_db "${igblast_db_prefix}_C" \
    -auxiliary_data "${IGDATA}/optional_file/${igblast_species}_gl.aux" \
    -organism "${igblast_species}" \
    -query "${contig_output_name}.fasta" \
    > igblast_result.tsv

# Perform a few conversions on the TSV.
#
# New columns are added starting from the right-most one: every added column
# increases the field count, so working right to left keeps the field numbers
# used below ($18, $16, $3, $2, $1) referring to the original columns.
#   - A length column is appended after each of columns 18, 16, 3 and 2.
#   - The comma-separated first column is expanded into four columns
#     (cell_index, contig_id, read_count, molecule_count).
# The BEGIN block consumes the first line with getline and rewrites it as the
# header; all remaining lines are handled by the main block.
#
# The cut command removes the 15th column (complete_vdj) of the expanded table.
# This column is produced in check_fulllengthness() in VDJ_Compile_Results.py,
# that function however doesn't produce the complete_vdj column in the same
# manner as AIRR, so if we want to produce that column in the same manner,
# then remove the cut command from here and remove the aforementioned function
# from VDJ_Compile_Results.py.
# Note: --complement is a GNU cut option.
#
# The output path is quoted so that a name containing whitespace does not make
# the redirection ambiguous.
awk 'BEGIN {
  FS = OFS = "\t"
  getline;
  $18=$18"\tsequence_alignment_aa_length";
  $16=$16"\tsequence_alignment_length";
  $3=$3"\tsequence_aa_length";
  $2=$2"\tsequence_length";
  $1="cell_index\tcontig_id\tread_count\tmolecule_count";
  print
} {
  $18=$18"\t"length($18);
  $16=$16"\t"length($16);
  $3=$3"\t"length($3);
  $2=$2"\t"length($2);
  gsub(/,/, "\t", $1);
  print
}' igblast_result.tsv | cut -d$'\t' -f15 --complement > "${contig_output_name}_igblast_result.tsv"

# Convert the TSV into a CSV and then compress with gzip.
# For column 19 (c_call), if the value is '*', then just erase the value.
# For columns 16-19 (v_call, d_call, j_call, c_call), we only select the first item in the comma separated list.
# Any other field that still contains a comma and is not already wrapped in
# double quotes is CSV-quoted: embedded double quotes are doubled and the field
# is wrapped in double quotes.
# The final $1 = $1 forces awk to rebuild the record with the ',' output
# separator even when no field was modified; it is a no-op otherwise.
awk 'BEGIN { FS="\t"; OFS="," }
{
  for (i = 1; i <= NF; ++i) {
    if (i == 19 && $i == "*") {
      $i = ""
    }
    if (i >= 16 && i <= 19 && $i ~ /,/) {
      sub(/,.*/, "", $i)
    }
    if ($i ~ /,/ && $i !~ /^".*"$/) {
      gsub("\"", "\"\"", $i)
      $i = "\"" $i "\""
    }
  }
  $1 = $1
  print
}' "${contig_output_name}_igblast_result.tsv" | gzip > "${contig_output_name}_pruned.csv.gz"
