final touches

Marcelo Ponce [2019-03-11 19:44:24]

final touches

Filename
core/comb_tables.sh~
core/countReads-wFns.sh
core/intergenic/det-interGenes-ORIG.sh

diff --git a/core/comb_tables.sh~ b/core/comb_tables.sh~
deleted file mode 100644
index 840d4f4..0000000
--- a/core/comb_tables.sh~
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/bin/bash
-###-xv
-
-# requires 3 arguments:
-# 1) INPUTfile (eg. BD1_INPUT)
-# 1) IPfile (eg. BD1_IP)
-# 1) T_THERM file (eg. T_thermophila_June2014.gff3)
-INPUTfile=`basename $1 .fastq.gz`
-IPfile=`basename $2 .fastq.gz`
-TTHERMfile=$3
-
-tableINPUTs=tableReadsINPUT.$INPUTfile
-
-tableIPs=tableReadsIP.$IPfile
-
-
-bigTABLE=TABLEE
-#normalizedTABLE=normalized.table.$IPfile-$INPUTfile
-#finalTABLE=FINAL.table.$IPfile-$INPUTfile
-IxFILES=$INPUTfile-$IPfile
-normalizedTABLE=normalized.table.$IxFILES
-finalTABLE=FINAL.table.$IxFILES
-
-# combine tables generated by "table.sh" and "alejandro.sh"
-paste table.$TTHERMfile $tableINPUTs $tableIPs > $bigTABLE
-
-# compute normalized quantities
-MIN=$(sort -n -k 4 $bigTABLE  | head -1 | awk '{print $4}')
-
-# find how many genes have MIN gene size
-MINgenes=$(awk -v min="$MIN" '{ if($4==min) print $0}' $bigTABLE  | wc -l)
-echo "Minimum gene size found: "$MIN
-echo "Number of genes with MIN gene size "$MINgenes
-
-# $5: nbr reads for INPUT, $6: nbr of reads for IP; $4: gene size
-##awk -v min="$MIN" '{print $5*min/$4" "$6*min/$4}' $bigTABLE > $normalizedTABLE
-##awk '{print $5*150./$4"\t"$6*150./$4}' TABLEE > normalizedTABLE
-#paste $bigTABLE $normalizedTABLE > $FINALtable
-### this calculation was doing a previous normalization using the MIN.gene.size
-### which is obsolete and NOT needed anymore...
-### awk -v min="$MIN" '{print $0 $5*min/$4"\t"$6*min/$4}' $bigTABLE
-
-
-#awk '{if ($7==0)  print "--"; else print $8/$7}' FINALtable  > scores
-#paste $TABLEE $normalizedTABLE | awk '{if ($7==0)  print "--"; else print $8/$7}' > $scores
-#paste FINALtable scores | sort -k 9 -n  > SUPERfinal
-#paste $TABLEE $normalizedTABLE | awk '{if ($7==0)  print "--"; else print $8/$7}' | sort -k 9 -n
-
-awk -v min="$MIN" '{print $0"\t"$5*min/$4"\t"$6*min/$4}' $bigTABLE | awk '{if ($7==0)  print $0"\t""--"; else print $0"\t"$8/$7}'   >  $finalTABLE--SORTED
-awk -v min="$MIN" '{print $0"\t"$5*min/$4"\t"$6*min/$4}' $bigTABLE | awk '{if ($7==0)  print $0"\t""--"; else print $0"\t"$8/$7}' | sort -k 9 -n   >  $finalTABLE
-
-
-### ADD headers to tables...
-#header="location \t name \t description \t GeneSize \t INPUT.reads \t IP.reads \t norm.INPUTs \t norm.IPs \t IP-INPUT.ratio"
-header="location \t name \t description \t GeneSize \t ${1}_reads \t ${1}_reads"
-for table in {$finalTABLE--SORTED,$finalTABLE}; do
-	echo -e $header | cat - $table > tmp && mv tmp $table
-done
-
-
-####
-# for generating table to use with intergenic regions script:
-#paste FINAL.table.BD2_IP_NGS-BD2_INPUT_NGS--UNSORTED /scratch2//s/scinet/mponce/Tetrahymena_Ryerson/1stAttempt/FINAL.table.BD1_IP_NGS-BD1_INPUT_NGS--UNSORTED | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"(($9+$18)*.5) }'  > combinedTABLES_BD1-BD2
diff --git a/core/countReads-wFns.sh b/core/countReads-wFns.sh
deleted file mode 100755
index b32cad1..0000000
--- a/core/countReads-wFns.sh
+++ /dev/null
@@ -1,254 +0,0 @@
-#!/bin/bash
-# -vx
-
-# main script used to analize and call (count) peaks in the T.Thermophila data.
-# This script also calls 2 other scripts: "table.sh" and "comb_tables.sh"
-# The script requires BWA and samtools to be installed as they are used within the pipeline.
-
-#################################
-### how to use it
-#
-# 1st argument: file with INPUT reads
-# 2nd argument: file with IP reads
-# 3rd argument: reference genome file (fasta)
-# 4th argument: annotiation file (gff3)
-# 5th argument: working space (if possible use RAMdisk --ie. /dev/shm/--, or /tmp in a computer with SSD)
-# 6th argument (optional): number of cores to use for BWA multi-threading
-#
-# Examples:
-# time PATHtoRACSrepo/core/countPeaks.sh   _1_MED1_INPUT_S25_L007_R1_001  _3_MED1_IP_S27_L007_R1_001  T_thermophila_June2014_assembly.fasta  T_thermophila_June2014.gff3  /tmp/  16
-# time PATHtoRACSrepo/core/countPeaks.sh   _1_MED1_INPUT_S25_L007_R1_001  _3_MED1_IP_S27_L007_R1_001  T_thermophila_June2014_assembly.fasta  T_thermophila_June2014.gff3  /dev/shm/  16
-#
-#################################
-
-
-### functions #########################################
-LOChelpMsg(){
-	echo "How to use this script:"
-	cancel;
-}
-#
-#
-LOCcheckTools(){
-	BWA=$(which bwa);
-	[ -z $BWA ] && errMsg "BWA needs to be installed for using this pipeline!"
-	SAMTOOLS=$(which samtools)
-	[ -z $SAMTOOLS ] && errMsg "SAMTOOLS needs to be installed in the system, as it is required in the pipeline!"
-}
-
-#####
-
-LOCcheckIntegrityPipeline(){
-	scriptsDIR=`dirname $0`
-	[ -z $scriptsDIR/table.sh ]  && errMsg "missing 'table.sh' from pipeline directory $scriptsDIR"
-	[ -z $scriptsDIR/comb_tables.sh ] && errMsg "mssing 'comb_tables.sh' from pipeline directory $scriptsDIR"
-}
-
-#
-#######################################################
-# setting preamble, detecting scripts location
-scriptsDIR=`dirname $0`
-
-# load auxiliary fns for integrity checks and message/error handling
-if [[ -f $scriptsDIR/auxs/auxFns.sh ]]; then
-        . $scriptsDIR/auxs/auxFns.sh --source-only;
-else
-        echo "Error auxiliary file: " $scriptsDIR/auxs/auxFns.sh "NOT found!"
-        exit
-fi
-#######################################################
-
-#### CHECKS ###########################################
-welcome
-
-### CHECK arguments
-if [[ $# -eq 0 ]]; then
-	errMsg "No arguments supplied!";
-fi
-#
-if [[ $# -lt 5 ]]; then
-	errMsg "Five mandatory arguments are needed!";
-fi
-
-### CHECK software
-checkIntegrityPipeline table.sh comb_tables.sh
-checkTools  bwa samtools
-######################################################
-
-# ====================================================
-# DEFINE SOME env.VARIABLES
-#dataDIR=DATA
-#dataDIR=data2
-#[ -d $1 ] && dataDIR=$1 || errMsg "1st argument should be a directory, where the reference data to be used is located"
-
-## reference file
-#fastaFILE=$dataDIR/T_thermophila_June2014_assembly.fasta	# <==== this could be another argument!!!
-#
-#faiFILE=$fastaFILE.fai
-
-## target files
-## TO BE READ as arguments... $1 (INPUT) and $2 (IP)
-[ -s $1 ] &&  INPUTfile=`basename $1` || errMsg "2nd argument should be a file with the INPUT reads"
-[ -s $2 ] &&  IPfile=`basename $2` || errMsg "3rd argument should be a file with the IP reads"
-[ -s $3 ] &&  FASTAfile=`basename $3` || errMsg "4th argument should be a file with the reference genome file (fasta)"
-[ -s $4 ] &&  REFfile=`basename $4` || errMsg "5th argument should be an annotation file (gff3)"
-
-faiFILE=$FASTAfile
-fastaFILE=$FASTAfile
-
-inputFILE1=INPUTfile
-inputFILE2=IPfile
-
-# could be RAMdisk or TEMP in a SSD
-# use RAMDISK
-#RAMdisk=/dev/shm/$USER/
-#RAMdisk=/tmp/$USER/
-RunTimeDir=ORF_RACS-$$-`date '+%Y%m%d-%H%M%S'`
-[ -d $5 ] && WORKINGdir=$5/$USER/$RunTimeDir || errMsg "6th argument must be a directory, we suggest using /dev/shm for RAMdisk or /tmp in a SSD device, as this pipeline is quite I/O intense!"
-echo "Working directory --> " $WORKINGdir
-
-
-# number of threads to use with BWA
-cores=$(grep -P '^core id\t' /proc/cpuinfo| wc -l)
-#[ -z $6 ] && NT=$6 || NT=$cores
-if [ " "$6 != " " ]; then NT=$6 ; else NT=$cores; fi
-echo "BWA will use nbrCores="$NT", when possible"
-
-## =====
-
-#inputFILE1=$INPUTfile	#.fastq.gz
-#[ -s $inputFILE1 ] || errMsg $inputFILE1" not found"
-#inputFILE2=$IPfile	#.fastq.gz
-#[ -s $inputFILE2 ] || errMsg $inputFILE2" not found"
-
-# save original directory
-myDIR=$(pwd)
-resultsDIR=$myDIR/ORF_RACS_results-`date '+%Y%m%d-%H%M%S'`	#`date +%D-%T`
-scriptsDIR=`dirname $0`
-
-
-# use RAMDISK
-#RAMdisk=/dev/shm/$USER/
-#RAMdisk=/tmp/$USER/
-
-echo # create local space on RAMDISK
-mkdir -p $WORKINGdir
-# copy data into WORKINGdir
-#cp -pr ./DATA  $RAMdisk
-cp  -prv $1 $WORKINGdir/$inputFILE1
-cp  -prv $2 $WORKINGdir/$inputFILE2
-cp  -prv $3 $WORKINGdir/$FASTAfile
-cp  -prv $4 $WORKINGdir/$REFfile
-# move to WORKINGdir
-cd $WORKINGdir
-
-
-# outputs
-outputINPUTbam=aln$INPUTfile.bam
-outputINPUTbamSORTED=aln$INPUTfile-sorted.bam
-outputINPUTsam=aln$INPUTfile.sam
-#outputIPbam=alnBD1_IP.bam
-#outputIPbamSORTED=alnBD1_IP-sorted.bam
-#outputIPsam=alnBD1_IP.sam
-outputIPbam=aln$IPfile.bam
-outputIPbamSORTED=aln$IPfile-sorted.bam
-outputIPsam=aln$IPfile.sam
-#output3gzip=/dev/shm/$USER/alnBD1_INPUT.tar.gz
-
-# tables files
-tableINPUTs=tableReadsINPUT.`basename $INPUTfile .fastq.gz`
-tableIPs=tableReadsIP.`basename $IPfile .fastq.gz`
-
-####=====================================================
-
-
-## pipeline
-# index the assembly
-# generates 4 aux. files (in principle could be removed!  -- *.fasta.*)
-$BWA index $fastaFILE
-#rm *.fasta.*
-
-# align the INPUT to assembly
-# bwa -> multithreaded... (upto 4)
-$BWA  aln -t $NT  $fastaFILE  $inputFILE1 >  $outputINPUTbam
-
-# convert to sequence coordinate for INPUT
-$BWA samse  $fastaFILE $outputINPUTbam $inputFILE1  >  $outputINPUTsam
-
-# remove inputFILE1 in case of memory issues
-#rm -v $inputFILE1
-
-#######
-# zip file in RAMDISK
-#tar cvzf $output3gzip $output2sam
-######
-
-# align the IP file to the assembly
-# bwa -> multithreaded... (upto 4)
-$BWA  aln -t $NT  $fastaFILE  $inputFILE2  >  $outputIPbam
-
-# convert to sequence coordinate for IP
-$BWA samse  $fastaFILE $outputIPbam $inputFILE2  >  $outputIPsam
-
-# remove inputFILE2 in case of oom
-#rm -v $inputFILE2
-
-
-# FAI index to the assembly
-$SAMTOOLS faidx $fastaFILE
-
-
-# align assembly witht the sam and bam files
-$SAMTOOLS import $faiFILE $outputINPUTsam $outputINPUTbam
-$SAMTOOLS import $faiFILE $outputIPsam $outputIPbam
-
-# sorting for IGV
-$SAMTOOLS sort $outputINPUTbam -o $outputINPUTbamSORTED
-$SAMTOOLS sort $outputIPbam -o $outputIPbamSORTED
-
-$SAMTOOLS index $outputINPUTbamSORTED
-$SAMTOOLS index $outputIPbamSORTED
-
-
-# define table file for extracting genes... passed as third argument to the script!!!
-table=table.$REFfile
-# extract the reads
-touch $tableINPUTs  #tableReadsINPUT
-# read scaffolds from table generated by the "table" script and loop over the sorted BAM files
-# NEEDS the file "table" generated by the script 'table.sh' // if it doesn't exist, launch its creation in parallel
-### [ ! -e $resultsDIR/$table ] || $scriptsDIR/table.sh $REFfile && cp -v $resultsDIR/$table .
-$scriptsDIR/table.sh $REFfile
-[ $? -ne 0 ] && errMsg "' $scriptsDIR/table.sh $REFfile' FAILED...! exitcode $?"
-
-for i in `awk '{print $1}' < $table `; do
-    #echo $i;
-    $SAMTOOLS view $outputINPUTbamSORTED $i | wc -l >> $tableINPUTs  #tableReadsINPUT
-done
-
-touch $tableIPs   #tableReadsIP
-for i in `awk '{print $1}' < $table `; do
-    #echo $i;
-    $SAMTOOLS view $outputIPbamSORTED $i | wc -l >> $tableIPs  #tableReadsIP
-done
-
-
-#remove temporal files...
-rm -v *.fasta.*
-
-### copy data into original directory
-mkdir -p $resultsDIR
-cp $outputINPUTbamSORTED $outputIPbamSORTED  $resultsDIR
-cp $outputINPUTsam $outputIPsam  $resultsDIR
-cp $tableINPUTs  $resultsDIR
-cp $tableIPs  $resultsDIR
-# need the index files for intergenic regions determination
-cp *bai $resultsDIR
-
-# FINAL step, combine tables together...
-#. $scriptsDIR/comb_tables.sh _2_MED2_INPUT_S26_L007_R1_001 _4_MED2_IP_S28_L007_R1_001 T_thermophila_June2014.gff3
-$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile
-[ $? -ne 0 ] && errMsg "'$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile' FAILED...! exitcode $?"
-cp -v FINAL.table.*  $resultsDIR
-
-# clean temp dir used
-rm -rfv $WORKINGdir
diff --git a/core/intergenic/det-interGenes-ORIG.sh b/core/intergenic/det-interGenes-ORIG.sh
deleted file mode 100755
index 12d6123..0000000
--- a/core/intergenic/det-interGenes-ORIG.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-
-# det-interGenes.sh
-# main script in charge of determining the intergenic regions
-# this part of the pipeline requires the table generated by the ORF reads/counts
-#
-#
-# HOW TO USE THIS SCRIPT:
-#	arg1: final combined table gnerated by the ORF from the RACS pipeline
-#	arg2: reference genome file (gff3)
-#	arg3: name of the file where to save the InterGenic Regions
-#	arg4/5:	name of the output file from ORF part/tag.(ref.) name
-#	arg6/7: ...
-#	...
-#
-# eg.
-#    det-interGenes.sh  combinedTABLES_MED1-MED2  dataset/T_thermophila_June2014.sorted.gff3  interGENs_MED1-MED2.csv
-#				Fillingham_1_MED1_INPUT_S25_L007_R1_001  INPUTm1	\
-#				Fillingham_2_MED2_INPUT_S26_L007_R1_001  INPUTm2	\
-#				Fillingham_3_MED1_IP_S27_L007_R1_001  IPm1	\
-#				Fillingham_4_MED2_IP_S28_L007_R1_001  IPm2	\
-#				...
-#
-
-#################################################
-# Setting preamble, detecting scripts location
-scriptsDIR=`dirname $0`
-
-#################################################
-# load auxiliary fns for integrity checks and message/error handling
-. $scriptsDIR/../auxs/auxFns.sh --source-only
-
-#### CHECKS #####################################
-### CHECK arguments
-if [[ $# -eq 0 ]]; then
-	errMsg "No arguments were supplied!";
-fi
-#
-
-### INTEGRITY CHECK
-# check RACS scripts
-checkIntegrityPipeline  intergenic/det-interGenes.sh  intergenic/interGeneRegions.R  intergenic/utils_RACS-IGR.R  intergenic/interGenes.sh
-# check external tools needed
-checkTools samtools Rscript
-#################################################
-
-# getting command line arguments
-
-combTABLE=$1    	#combinedTABLES_MED1-MED2
-refFILE=$2		#dataset/T_thermophila_June2014.sorted.gff3
-interGENregions=$3      #interGENs_MED1-MED2.csv  -- output from Rscript
-#
-INPUTfile1=$4	#Fillingham_1_MED1_INPUT_S25_L007_R1_001
-tag1=$5		#INPUTm1
-INPUTfile2=$6	#Fillingham_2_MED2_INPUT_S26_L007_R1_001
-tag2=$7		#INPUTm2
-INPUTfile3=$8	#Fillingham_3_MED1_IP_S27_L007_R1_001
-tag3=$9		#IPm1
-INPUTfile4=${10}	#Fillingham_4_MED2_IP_S28_L007_R1_001
-tag4=${11}	#IPm2
-#
-combinedFinalIGenicTable=${interGENregions}_INPUTs-IPs`date '+%Y%m%d-%H%M%S'`.csv
-# ================================================
-
-# step #0: look into combTABLE and generate list of intergenic regions
-t0=$(time Rscript  $scriptsDIR/interGeneRegions.R   $combTABLE  $refFILE  $interGENregions)
-
-# process each of the samples in comparison to the intergenic regions just generated in 'step #0'
-t1=$(time  $scriptsDIR/interGenes.sh  $tag1  $INPUTfile1  $interGENregions)
-#t1=$(time . $scriptsDIR/interGenes.sh  INPUTm1 Fillingham_1_MED1_INPUT_S25_L007_R1_001  interGENs_MED1-MED2.csv )
-t2=$(time  $scriptsDIR/interGenes.sh  $tag2  $INPUTfile2  $interGENregions)
-#t2=$(time . $scriptsDIR/interGenes.sh  INPUTm2 Fillingham_2_MED2_INPUT_S26_L007_R1_001  interGENs_MED1-MED2.csv )
-t3=$(time  $scriptsDIR/interGenes.sh  $tag3  $INPUTfile3  $interGENregions)
-#t3=$(time . $scriptsDIR/interGenes.sh  IPm1 Fillingham_3_MED1_IP_S27_L007_R1_001  interGENs_MED1-MED2.csv )
-t4=$(time  $scriptsDIR/interGenes.sh  $tag4  $INPUTfile4  $interGENregions)
-#t4=$(time . $scriptsDIR/interGenes.sh  IPm2 Fillingham_4_MED2_IP_S28_L007_R1_001  interGENs_MED1-MED2.csv )
-
-# combine tables with INPUT/IP reads
-#t5=$(time paste interGENs_MED1-MED2.csv interGENs-Fillingham_1_MED1_INPUT_S25_L007_R1_001 interGENs-Fillingham_3_MED1_IP_S27_L007_R1_001 interGENs-Fillingham_2_MED2_INPUT_S26_L007_R1_001 interGENs-Fillingham_4_MED2_IP_S28_L007_R1_001 > interGENs_MED1-MED2_INPUTs-IPs.csv )
-t5=$(time paste  $interGENregions  interGENs-$INPUTfile1  interGENs-$INPUTfile3  interGENs-$INPUTfile2  interGENs-$INPUTfile4  >  $combinedFinalIGenicTable)
-
-# statistics on running times
-echo "Partial Times: $t0, $t1, $t2, $t3, $t4"
-echo Total time: $t0+($t1+$t2+$t3+$t4)+$t5
-
-

ViewGit