#!/bin/bash

# Require exactly one positional argument (the fastq file to trim).
# On misuse, print the usage text to stderr and exit with a non-zero status
# so that callers can detect the failure (a bare `exit` would report the
# status of the preceding echo, i.e. success).
if [[ $# -ne 1 ]]; then
    echo "Number of arguments is not 1.
    Usage: bash ./VDJ_Trim_Reads.sh <fastq file>
        fastq file: a fastq file that will undergo trimming
    " 1>&2
    exit 1
fi

#######################################
# Derive the output file prefix from an input path.
# Arguments: $1 - path to the input fastq file (may contain spaces)
# Outputs:   on stdout, the file name without its directory and with
#            everything from the first '.' onwards removed, followed by a
#            single trailing '.' (e.g. "dir/sample.fastq.gz" -> "sample.")
#######################################
vdj_output_prefix() {
    local name
    # Quoting keeps a path containing whitespace as a single operand.
    name=$(basename -- "$1")
    printf '%s.\n' "${name%%.*}"
}

in_fp=$1
out_fp=$(vdj_output_prefix "${in_fp}")

echo "VDJ_Trim_Reads shell Input Fastq: ${in_fp}" 1>&2
echo "VDJ_Trim_Reads shell Output basename: ${out_fp}" 1>&2

# cutadapt needs somewhere between 5 and 11 open file handles per thread, so
# we budget 16 handles per thread. If the allowed number of open file handles
# is lower than (cores * 16), the number of threads is reduced to fit.

#######################################
# Compute how many cutadapt threads the open-file limit can support.
# Arguments: $1 - number of available cores (desired thread count)
#            $2 - open file handle limit as printed by `ulimit -n`
#                 (a number, or a non-numeric value such as "unlimited")
# Outputs:   thread count on stdout: $1 when the limit is sufficient or
#            not numeric; otherwise limit / 16, but never less than 1
#######################################
vdj_threads_for_file_limit() {
    local cores=$1 limit=$2
    local -r handles_per_thread=16

    # A non-numeric limit (e.g. "unlimited") imposes no restriction.
    case "$limit" in
        '' | *[!0-9]*)
            echo "$cores"
            return 0
            ;;
    esac

    if (( limit < cores * handles_per_thread )); then
        local allowed=$(( limit / handles_per_thread ))
        # Never report 0 threads: `cutadapt -j 0` means "auto-detect the
        # number of cores", which would undo the reduction entirely.
        (( allowed >= 1 )) || allowed=1
        echo "$allowed"
    else
        echo "$cores"
    fi
}

available_cores=$(nproc)
threads=$available_cores
echo "Initialized threads to $threads" 1>&2
threads=$(vdj_threads_for_file_limit "$available_cores" "$(ulimit -n)")
if [ "$threads" -lt "$available_cores" ]; then
    echo "File handle limit not high enough for the desired number of threads ${available_cores}." \
        "Reducing the number of threads to $threads. To update the number of allowed file" \
        "handles, use ulimit -n [file_handle_limit]." 1>&2
fi

#######################################
# Convert a FASTQ stream on stdin into '|'-delimited records and split the
# records across files according to the last character of each read header.
# Arguments: $1 - output file prefix; every record is written to
#                 "<prefix><last character of header>.vdjtxt"
# Outputs:   "<prefix>?.vdjtxt" files, relative to the current directory
#######################################
vdj_split_reads_by_header_suffix() {
    local prefix=$1
    # Stage 1: drop the third line of every 4-line FASTQ record (the '+' line).
    # Stage 2: join header, sequence and quality on one line, delimited by '|'.
    # Stage 3: remove the first character of the line (the leading '@').
    # Stage 4: write the entire line to prefix + last character of the header
    #          ($1) + ".vdjtxt". The redirection target is parenthesized
    #          because an unparenthesized concatenation after '>' is not
    #          portable across awk implementations.
    awk '(NR-3)%4!=0' \
        | paste -d'|' - - - \
        | sed 's/^.//' \
        | awk -v out_fp="$prefix" -F "|" \
            '{print > (out_fp substr($1, length($1), 1) ".vdjtxt")}'
}

# Output multiple pipe delimited text files - split based on the last digit of the cell label
# CCCAAAAAAAA                    => OligodT Reverse Complement
# TATGCGTAGTAGGTATG              => TSO1
# CATACCTACTACGCATA              => TSO1 Reverse Complement - cutting 2 bases off the full sequence gives better trimming
# GTGGAGTCGTGATTATA              => TSO2
# TATAATCACGACTCCAC              => TSO2 Reverse Complement - cutting 2 bases off the full sequence gives better trimming
# ACAGGAAACTCATGGTGCGT           => TCR/BCR 5' Primer
# ACGCACCATGAGTTTCCTGT           => TCR/BCR 5' Primer Reverse Complement
# AAGCAGTGGTATCAACGCAGAGTACATGGG => PacBio Kinnex long read handle
echo "Running cutadapt" 1>&2
cutadapt -n 4 -e 0.15 \
    -a "CCCAAAAAAAA;min_overlap=11" \
    -g "TATGCGTAGTAGGTATG;min_overlap=10" \
    -a "CATACCTACTACGCA;min_overlap=7" \
    -g "GTGGAGTCGTGATTATA;min_overlap=10" \
    -a "TATAATCACGACTCC;min_overlap=7" \
    -a "ACAGGAAACTCATGGTGCGT;min_overlap=10" \
    -g "ACGCACCATGAGTTTCCTGT;min_overlap=10" \
    -g "AAGCAGTGGTATCAACGCAGAGTACATGGG;min_overlap=24" \
    -j "$threads" \
    -m 25 --quality-cutoff 20 "${in_fp}" 2> cutadapt.log \
    | vdj_split_reads_by_header_suffix "${out_fp}"
# Capture cutadapt's own status immediately; the pipeline's status is that of
# its last stage. The script keeps going either way, but a failure is no
# longer silent.
cutadapt_status=${PIPESTATUS[0]}
if [ "${cutadapt_status}" -ne 0 ]; then
    echo "WARNING: cutadapt exited with status ${cutadapt_status}; see cutadapt.log" 1>&2
fi

#######################################
# Compress every *.vdjtxt file in the current directory with pigz.
# Does nothing when no such file exists.
# Returns: pigz's exit status, or 0 when there was nothing to compress
#######################################
vdj_compress_outputs() {
    # The "./" prefix keeps file names that start with "-" from being
    # interpreted as command-line options.
    local vdjtxt_files=(./*.vdjtxt)
    # With no match the pattern stays literal and names a non-existent path.
    if [[ -e "${vdjtxt_files[0]}" || -L "${vdjtxt_files[0]}" ]]; then
        echo "Compressing results" 1>&2
        pigz "${vdjtxt_files[@]}"
    fi
}

echo "cutadapt has finished, searching for *.vdjtxt files" 1>&2
vdj_compress_outputs


