Last commit for core/comb_tables.sh: 84d742d3ed3a848711ec3056e40f36c653809433

adding echoes to identify which part of the pipeline is being entered...

Marcelo Ponce [2019-07-01 18:23:21]

adding echoes to identify which part of the pipeline is being entered...

#!/bin/bash
###-xv

# comb_tables.sh 	---	RACS ORF pipeline
#
# auxiliary internal script to RACS ORF pipeline, called from countReads.sh
#
# Pastes the annotation table and the INPUT/IP read-count tables side by side
# (tables generated by "table.sh" and "alejandro.sh") and writes the result,
# preceded by a header line, to FINAL.table.<INPUT>-<IP>.
#
# requires 3 arguments:
# 1) INPUTfile (eg. BD1_INPUT)
# 2) IPfile (eg. BD1_IP)
# 3) T_THERM file (eg. T_thermophila_June2014.gff3)
#
# Any leading directory and a trailing ".fastq.gz" are stripped from
# arguments 1 and 2 before they are used to build file names.
#
# input files (paths relative to the working directory):
#   table.<T_THERM file>   tableReadsINPUT.<INPUT>   tableReadsIP.<IP>
# output file:
#   FINAL.table.<INPUT>-<IP>
#
# exit status: 0 on success, 2 on wrong usage, 1 on any other failure

# Print a short usage message to stderr.
usage() {
  printf 'Usage: %s INPUTfile IPfile T_THERMfile\n' "${0##*/}" >&2
}

# Write the header line followed by the column-wise combination of the three
# tables to stdout.
# Arguments: $1 - label for the INPUT reads column (raw 1st script argument)
#            $2 - label for the IP reads column (raw 2nd script argument)
#            $3 - annotation table, $4 - INPUT reads table, $5 - IP reads table
write_table() {
  # The labels go through %s so that backslashes in file names are not
  # interpreted as escape sequences (the old `echo -e $header` did that).
  printf 'location \t name \t description \t GeneSize \t %s_reads \t %s_reads\n' \
    "$1" "$2" &&
    paste "$3" "$4" "$5"
}

# Entry point: validate arguments, build the final table and report the
# minimum gene size (column 4) and how many genes have that size.
main() {
  if (( $# < 3 )); then
    usage
    return 2
  fi

  local input_name ip_name
  input_name=$(basename -- "$1" .fastq.gz) || return 1
  ip_name=$(basename -- "$2" .fastq.gz) || return 1
  local ttherm_file=$3

  local gene_table="table.${ttherm_file}"
  local input_table="tableReadsINPUT.${input_name}"
  local ip_table="tableReadsIP.${ip_name}"
  local final_table="FINAL.table.${input_name}-${ip_name}"

  ####
  echo ">>> entering $0 - RACS ORF..."
  ####

  # Fail early instead of producing a header-only or truncated final table.
  local table
  for table in "$gene_table" "$input_table" "$ip_table"; do
    if [[ ! -r "$table" ]]; then
      printf '%s: cannot read input table: %s\n' "${0##*/}" "$table" >&2
      return 1
    fi
  done

  # Header and combined tables are written straight into the final table, so
  # no fixed-name scratch files (formerly "TABLEE" and "tmp") are created in
  # the working directory.
  if ! write_table "$1" "$2" "$gene_table" "$input_table" "$ip_table" \
    > "$final_table"; then
    rm -f -- "$final_table"
    printf '%s: failed to write %s\n' "${0##*/}" "$final_table" >&2
    return 1
  fi

  # $5: nbr reads for INPUT, $6: nbr of reads for IP; $4: gene size
  # The statistics below are informational only; line 1 (the header) is
  # skipped when reading the final table.
  local min_size min_count
  min_size=$(tail -n +2 "$final_table" | sort -n -k 4 | head -n 1 \
    | awk '{print $4}')
  # find how many genes have MIN gene size
  min_count=$(awk -v min="$min_size" \
    'NR > 1 && $4 == min { n++ } END { print n + 0 }' "$final_table")
  printf '%s\n' "Minimum gene size found: ${min_size}"
  printf '%s\n' "Number of genes with MIN gene size ${min_count}"

  ### Historical note: an earlier version added columns normalized by the
  ### MIN gene size plus an IP/INPUT ratio, which is obsolete and NOT needed
  ### anymore:
  ### awk -v min="$MIN" '{print $0"\t"$5*min/$4"\t"$6*min/$4}' $bigTABLE | awk '{if ($7==0)  print $0"\t""--"; else print $0"\t"$8/$7}' | sort -k 9 -n   >  $finalTABLE
  ### with header:
  ### "location \t name \t description \t GeneSize \t INPUT.reads \t IP.reads \t norm.INPUTs \t norm.IPs \t IP-INPUT.ratio"
  # in principle, there is only one final table ordered by scaffold loc.

  return 0
}

# Must stay the last command: its return value is the script's exit status.
main "$@"


####
# for generating table to use with intergenic regions script:
#paste FINAL.table.BD2_IP_NGS-BD2_INPUT_NGS--UNSORTED /scratch2//s/scinet/mponce/Tetrahymena_Ryerson/1stAttempt/FINAL.table.BD1_IP_NGS-BD1_INPUT_NGS--UNSORTED | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"(($9+$18)*.5) }'  > combinedTABLES_BD1-BD2

ViewGit