changing exit --> return to avoid killing the terminal on an error exit msg
changing exit --> return to avoid killing the terminal on an error exit msg
#!/bin/bash
# -vx
# RACS ORF pipeline
# main script used to analize and call (count) peaks in the T.Thermophila data.
# This script also calls 2 other scripts: "table.sh" and "comb_tables.sh"
# The script requires BWA and samtools to be installed as they are used within the pipeline.
#################
### how to use it
#
# 1st argument: file with INPUT reads (.fastq.gz)
# 2nd argument: file with IP reads (.fastq.gz)
# 3rd argument: reference genome file (.fasta)
# 4th argument: annotiation file (.gff3)
# 5th argument: working space (if possible use RAMdisk --ie. /dev/shm/--, or /tmp in a computer with SSD)
# 6th argument (optional): number of cores to use for BWA multi-threading
# 7th argument (optional): file with definitions for processing organisms other than the Tetrahymena Thermophila, or for looking at other keys/terms
#
#
# Examples:
# PATHtoRACSrepo/core/countReads.sh pathTOdata/_INPUT_file.fastq.gz pathTOdata/_IP_file.fastq.gz pathTOrefs/T_thermophila_June2014_assembly.fasta pathTOrefs/T_thermophila_June2014.gff3 /tmp/ 16
# PATHtoRACSrepo/core/countReads.sh pathTOdata/_INPUT_file.fastq.gz pathTOdata/_IP_file.fastq.gz pathTOrefs/T_thermophila_June2014_assembly.fasta pathTOrefs/T_thermophila_June2014.gff3 /dev/shm/ 16
# PATHtoRACSrepo/core/countReads.sh pathTOdata/_INPUT_file.fastq.gz pathTOdata/_IP_file.fastq.gz pathTOrefs/T_thermophila_June2014_assembly.fasta pathTOrefs/T_thermophila_June2014.gff3 /dev/shm/ "" pathTOdefns/myDefns.id
#
#
# More examples are available in the README file of the RACS repository
#
#################################
### TIMERS... #################
startTime=`date +%s`
###############################
### LOG function ##############
logging() {
# set log file
LOG_FILE=${myDIR}/"racs_${RunTimeDir}.log"
# open log file
touch ${LOG_FILE}
# CAPTURING standard output/error and still displaying in 'screen' (std.output)
# capture std.output
#exec 1> ${LOG_FILE}
exec 1> >(tee -a ${LOG_FILE} >&1)
# capture std.error
exec 2>&1
# capture std. error and also mirror it to screen
#exec 1> ${LOG_FILE} 2> >(tee -a ${LOG_FILE} >&1)
exec 2> >(tee -a ${LOG_FILE} >&1)
# other ways...
# >&3 | tee /dev/tty
###{ { echo out; echo err 1>&2; } 2>&1 >&3 | tee /dev/tty; } >${LOG_FILE} 3>&1
#exec 1> ${LOG_FILE} 2> >(tee -a ${LOG_FILE} >&2)
###
# display welcome line about RACS
welcome
echo ""
# include original issued command passed as an argument to this fn
echo " cmd>>> " $@
echo ""
# print date
date
echo ".................."
}
################################
#######################################################
# check that the script is not being sourced!!!
if [[ $0 == '-bash' ]]; then
echo "Please do not 'source' this script; ie. run it as PATHtoRACS/core/SCRIPTname arguments"
stop
fi
# setting preamble, detecting scripts location
scriptDIR=`dirname $0`
# get the absolute path of RACS...
scriptsDIR=$( cd "${scriptDIR}" && pwd )
echo "RACS location --> ${scriptsDIR}"
# load auxiliary fns for integrity checks and message/error handling
if [[ -f $scriptsDIR/auxs/auxFns.sh ]]; then
. $scriptsDIR/auxs/auxFns.sh --source-only;
else
echo "Error auxiliary file: " $scriptsDIR/auxs/auxFns.sh "NOT found!"
return
fi
#######################################################
#### CHECKS ###########################################
welcome
### CHECK arguments
if [[ $# -eq 0 ]]; then
errMsg "No arguments supplied!";
fi
#
if [[ $# -lt 5 ]]; then
errMsg "Five mandatory arguments are needed!";
fi
### CHECK software
checkIntegrityPipeline table.sh comb_tables.sh
checkTools bwa samtools
######################################################
# define generic names for required programs...
BWA=$(which bwa)
SAMTOOLS=$(which samtools)
# ====================================================
# DEFINE SOME env.VARIABLES
#dataDIR=DATA
#dataDIR=data2
#[ -d $1 ] && dataDIR=$1 || errMsg "1st argument should be a directory, where the reference data to be used is located"
## reference file
#fastaFILE=$dataDIR/T_thermophila_June2014_assembly.fasta # <==== this could be another argument!!!
#
#faiFILE=$fastaFILE.fai
## target files
## TO BE READ as arguments... $1 (INPUT) and $2 (IP)
[ -s $1 ] && INPUTfile=`basename $1` || errMsg "1st argument should be a file with the INPUT reads (fastq.gz)"
[ -s $2 ] && IPfile=`basename $2` || errMsg "2nd argument should be a file with the IP reads (fastq.gz)"
[ -s $3 ] && FASTAfile=`basename $3` || errMsg "3rd argument should be a file with the reference genome file (fasta)"
[ -s $4 ] && REFfile=`basename $4` || errMsg "4th argument should be an annotation file (gff3)"
faiFILE=$FASTAfile
fastaFILE=$FASTAfile
inputFILE1=INPUTfile
inputFILE2=IPfile
# could be RAMdisk or TEMP in a SSD
# use RAMDISK
#RAMdisk=/dev/shm/$USER/
#RAMdisk=/tmp/$USER/
RunTimeDir=ORF_RACS-$$-`date '+%Y%m%d-%H%M%S'`
[ -d $5 ] && WORKINGdir=$( cd "$5" && pwd )/$USER/$RunTimeDir || errMsg "6th argument must be a directory, we suggest using /dev/shm for RAMdisk or /tmp in a SSD device, as this pipeline is quite I/O intense!"
echo "Working directory --> " $WORKINGdir
# number of threads to use with BWA
#cores=$(grep -P '^core id\t' /proc/cpuinfo| wc -l)
cores=`detectCores`
#[ -z $6 ] && NT=$6 || NT=$cores
if [ " "$6 != " " ]; then NT=$6 ; else NT=$cores; fi
echo "BWA will use nbrCores="$NT", when possible"
## =====
#inputFILE1=$INPUTfile #.fastq.gz
#[ -s $inputFILE1 ] || errMsg $inputFILE1" not found"
#inputFILE2=$IPfile #.fastq.gz
#[ -s $inputFILE2 ] || errMsg $inputFILE2" not found"
# save original directory
myDIR=$(pwd)
resultsDIR=$myDIR/ORF_RACS_results-`date '+%Y%m%d-%H%M%S'` #`date +%D-%T`
## -.-.-.-
ORGANISM=${7:-"${scriptsDIR}/defns/TT_gene.id"}
echo "==>>> organism specified by '${ORGANISM}' to process..."
checkFile $ORGANISM
ORGfile=`basename $ORGANISM`
## -.-.-.-
# use RAMDISK
#RAMdisk=/dev/shm/$USER/
#RAMdisk=/tmp/$USER/
echo # create local space on RAMDISK
mkdir -p $WORKINGdir
# copy data into WORKINGdir
#cp -pr ./DATA $RAMdisk
cp -prv $1 $WORKINGdir/$inputFILE1
cp -prv $2 $WORKINGdir/$inputFILE2
cp -prv $3 $WORKINGdir/$FASTAfile
cp -prv $4 $WORKINGdir/$REFfile
cp -prv $ORGANISM $WORKINGdir/$ORGfile
# move to WORKINGdir
cd $WORKINGdir
# outputs
outputINPUTbam=aln$INPUTfile.bam
outputINPUTbamSORTED=aln$INPUTfile-sorted.bam
outputINPUTsam=aln$INPUTfile.sam
#outputIPbam=alnBD1_IP.bam
#outputIPbamSORTED=alnBD1_IP-sorted.bam
#outputIPsam=alnBD1_IP.sam
outputIPbam=aln$IPfile.bam
outputIPbamSORTED=aln$IPfile-sorted.bam
outputIPsam=aln$IPfile.sam
#output3gzip=/dev/shm/$USER/alnBD1_INPUT.tar.gz
# tables files
tableINPUTs=tableReadsINPUT.`basename $INPUTfile .fastq.gz`
tableIPs=tableReadsIP.`basename $IPfile .fastq.gz`
####=====================================================
# begin outputing to LOG file...
logging $0$@
## pipeline
# index the assembly
# generates 4 aux. files (in principle could be removed! -- *.fasta.*)
$BWA index $fastaFILE
#rm *.fasta.*
# align the INPUT to assembly
# bwa -> multithreaded... (upto 4)
$BWA aln -t $NT $fastaFILE $inputFILE1 > $outputINPUTbam
# convert to sequence coordinate for INPUT
$BWA samse $fastaFILE $outputINPUTbam $inputFILE1 > $outputINPUTsam
# remove inputFILE1 in case of memory issues
#rm -v $inputFILE1
#######
# zip file in RAMDISK
#tar cvzf $output3gzip $output2sam
######
# align the IP file to the assembly
# bwa -> multithreaded... (upto 4)
$BWA aln -t $NT $fastaFILE $inputFILE2 > $outputIPbam
# convert to sequence coordinate for IP
$BWA samse $fastaFILE $outputIPbam $inputFILE2 > $outputIPsam
# remove inputFILE2 in case of oom
#rm -v $inputFILE2
# FAI index to the assembly
$SAMTOOLS faidx $fastaFILE
# align assembly witht the sam and bam files
$SAMTOOLS import $faiFILE $outputINPUTsam $outputINPUTbam
$SAMTOOLS import $faiFILE $outputIPsam $outputIPbam
# sorting for IGV
$SAMTOOLS sort $outputINPUTbam -o $outputINPUTbamSORTED
$SAMTOOLS sort $outputIPbam -o $outputIPbamSORTED
$SAMTOOLS index $outputINPUTbamSORTED
$SAMTOOLS index $outputIPbamSORTED
# define table file for extracting genes... passed as third argument to the script!!!
table=table.$REFfile
# extract the reads
touch $tableINPUTs #tableReadsINPUT
# read scaffolds from table generated by the "table" script and loop over the sorted BAM files
# NEEDS the file "table" generated by the script 'table.sh' // if it doesn't exist, launch its creation in parallel
### [ ! -e $resultsDIR/$table ] || $scriptsDIR/table.sh $REFfile && cp -v $resultsDIR/$table .
cmdTable="$scriptsDIR/table.sh $REFfile $ORGfile"
${cmdTable}
#???#### [ $? -ne 0 ] && errMsg "' ${cmdTable} ' FAILED...! exitcode $?"
for i in `awk '{print $1}' < $table `; do
#echo $i;
$SAMTOOLS view $outputINPUTbamSORTED $i | wc -l >> $tableINPUTs #tableReadsINPUT
done
touch $tableIPs #tableReadsIP
for i in `awk '{print $1}' < $table `; do
#echo $i;
$SAMTOOLS view $outputIPbamSORTED $i | wc -l >> $tableIPs #tableReadsIP
done
#remove temporal files...
rm -v *.fasta.*
### copy data into original directory
mkdir -p $resultsDIR
cp $outputINPUTbamSORTED $outputIPbamSORTED $resultsDIR
cp $outputINPUTsam $outputIPsam $resultsDIR
cp $tableINPUTs $resultsDIR
cp $tableIPs $resultsDIR
# need the index files for intergenic regions determination
cp *bai $resultsDIR
# FINAL step, combine tables together...
#. $scriptsDIR/comb_tables.sh _2_MED2_INPUT_S26_L007_R1_001 _4_MED2_IP_S28_L007_R1_001 T_thermophila_June2014.gff3
$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile
[ $? -ne 0 ] && errMsg "'$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile' FAILED...! exitcode $?"
cp -v FINAL.table.* $resultsDIR
# clean temp dir used
rm -rfv $WORKINGdir
### TIMERS... #################
endTime=`date +%s`
runtime=$((endTime-startTime))
echo "-----------------------------"
echo "Total runtime: $runtime secs"
echo "-----------------------------"
################################