Adding echoes to identify which part of the pipeline is being entered...
Adding echoes to identify which part of the pipeline is being entered...
#!/bin/bash
# comb_tables.sh --- RACS ORF pipeline
#
# Auxiliary internal script of the RACS ORF pipeline, called from countReads.sh.
#
# Combines, column-wise, the gene table with the INPUT and IP read-count
# tables, reports the minimum gene size, and writes the combined table with a
# header line to FINAL.table.<INPUT>-<IP> in the current directory.
#
# Requires 3 arguments:
#   1) INPUTfile    (eg. BD1_INPUT)
#   2) IPfile       (eg. BD1_IP)
#   3) T_THERM file (eg. T_thermophila_June2014.gff3)
#
# Files read from the current directory:
#   table.<T_THERM file>
#   tableReadsINPUT.<INPUTfile without .fastq.gz>
#   tableReadsIP.<IPfile without .fastq.gz>
# (the original notes say these are generated by "table.sh" and "alejandro.sh")
#
# Exit status: 0 on success, 2 on missing arguments, 1 on any other failure.
#
# Debugging: run as `bash -xv comb_tables.sh ...` to trace execution.
#
# History: earlier versions also normalized the read counts by the minimum
# gene size and appended an IP/INPUT ratio column. That normalization is
# obsolete and no longer needed, so the final table is just the combined table
# plus a header; the minimum gene size is only reported on stdout.
#
# Errors are handled explicitly (no `set -e`, no `exit` inside functions) so
# that the status returned by main is the exit status of the script.

# Print a short usage message on stderr.
usage() {
  printf 'Usage: %s INPUTfile IPfile T_THERMfile\n' "$0" >&2
}

# Print the base name of $1 with a trailing ".fastq.gz" removed.
strip_fastq_suffix() {
  basename -- "$1" .fastq.gz
}

# Print the smallest value found in column 4 (gene size) of table $1.
# Prints nothing when the table is empty.
# NOTE(review): awk splits on any run of blanks, exactly like the sort/awk
# pair this replaces, so column 4 is the gene size only if the first three
# columns contain no embedded blanks -- confirm against the table generator.
min_gene_size() {
  awk 'NR == 1 || $4 + 0 < min + 0 { min = $4 }
       END { if (NR) print min }' "$1"
}

# Print how many rows of table $1 have column 4 equal to $2.
count_genes_of_size() {
  awk -v min="$2" '$4 == min { n++ } END { print n + 0 }' "$1"
}

# Print the header line of the final table; $1 and $2 are the INPUT and IP
# arguments exactly as given on the command line. The blanks around each tab
# are kept on purpose: they reproduce the header emitted by earlier versions.
table_header() {
  printf 'location \t name \t description \t GeneSize \t %s_reads \t %s_reads\n' "$1" "$2"
}

# Entry point: $1 = INPUTfile, $2 = IPfile, $3 = T_THERM file.
main() {
  if (( $# < 3 )); then
    usage
    return 2
  fi

  local INPUTfile IPfile
  INPUTfile=$(strip_fastq_suffix "$1") || return 1
  IPfile=$(strip_fastq_suffix "$2") || return 1
  local TTHERMfile=$3
  local tableINPUTs="tableReadsINPUT.$INPUTfile"
  local tableIPs="tableReadsIP.$IPfile"
  local finalTABLE="FINAL.table.$INPUTfile-$IPfile"

  ####
  echo ">>> entering $0 - RACS ORF..."
  ####

  # Private scratch file instead of fixed names in the working directory, so
  # that concurrent runs in the same directory cannot clobber each other.
  local bigTABLE
  bigTABLE=$(mktemp "${TMPDIR:-/tmp}/comb_tables.XXXXXX") || {
    echo "$0: error: cannot create temporary file" >&2
    return 1
  }

  local status=0
  # Columns after combining: $4 = gene size, $5 = reads for INPUT,
  # $6 = reads for IP.
  if paste "table.$TTHERMfile" "$tableINPUTs" "$tableIPs" > "$bigTABLE"; then
    local MIN MINgenes
    MIN=$(min_gene_size "$bigTABLE")
    # find how many genes have MIN gene size
    MINgenes=$(count_genes_of_size "$bigTABLE" "$MIN")
    echo "Minimum gene size found: $MIN"
    echo "Number of genes with MIN gene size $MINgenes"

    # There is only one final table, ordered as the input tables are.
    if ! { table_header "$1" "$2" && cat -- "$bigTABLE"; } > "$finalTABLE"; then
      echo "$0: error: cannot write $finalTABLE" >&2
      status=1
    fi
  else
    echo "$0: error: cannot combine table.$TTHERMfile, $tableINPUTs and $tableIPs" >&2
    status=1
  fi

  rm -f -- "$bigTABLE"
  return "$status"
}

main "$@"
####
# for generating table to use with intergenic regions script:
#paste FINAL.table.BD2_IP_NGS-BD2_INPUT_NGS--UNSORTED /scratch2//s/scinet/mponce/Tetrahymena_Ryerson/1stAttempt/FINAL.table.BD1_IP_NGS-BD1_INPUT_NGS--UNSORTED | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"(($9+$18)*.5) }' > combinedTABLES_BD1-BD2