:78 Function create_function() is deprecated [8192]
diff --git a/core/comb_tables.sh~ b/core/comb_tables.sh~ deleted file mode 100644 index 840d4f4..0000000 --- a/core/comb_tables.sh~ +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -###-xv - -# requires 3 arguments: -# 1) INPUTfile (eg. BD1_INPUT) -# 1) IPfile (eg. BD1_IP) -# 1) T_THERM file (eg. T_thermophila_June2014.gff3) -INPUTfile=`basename $1 .fastq.gz` -IPfile=`basename $2 .fastq.gz` -TTHERMfile=$3 - -tableINPUTs=tableReadsINPUT.$INPUTfile - -tableIPs=tableReadsIP.$IPfile - - -bigTABLE=TABLEE -#normalizedTABLE=normalized.table.$IPfile-$INPUTfile -#finalTABLE=FINAL.table.$IPfile-$INPUTfile -IxFILES=$INPUTfile-$IPfile -normalizedTABLE=normalized.table.$IxFILES -finalTABLE=FINAL.table.$IxFILES - -# combine tables generated by "table.sh" and "alejandro.sh" -paste table.$TTHERMfile $tableINPUTs $tableIPs > $bigTABLE - -# compute normalized quantities -MIN=$(sort -n -k 4 $bigTABLE | head -1 | awk '{print $4}') - -# find how many genes have MIN gene size -MINgenes=$(awk -v min="$MIN" '{ if($4==min) print $0}' $bigTABLE | wc -l) -echo "Minimum gene size found: "$MIN -echo "Number of genes with MIN gene size "$MINgenes - -# $5: nbr reads for INPUT, $6: nbr of reads for IP; $4: gene size -##awk -v min="$MIN" '{print $5*min/$4" "$6*min/$4}' $bigTABLE > $normalizedTABLE -##awk '{print $5*150./$4"\t"$6*150./$4}' TABLEE > normalizedTABLE -#paste $bigTABLE $normalizedTABLE > $FINALtable -### this calculation was doing a previous normalization using the MIN.gene.size -### which is obsolete and NOT needed anymore... -### awk -v min="$MIN" '{print $0 $5*min/$4"\t"$6*min/$4}' $bigTABLE - - -#awk '{if ($7==0) print "--"; else print $8/$7}' FINALtable > scores -#paste $TABLEE $normalizedTABLE | awk '{if ($7==0) print "--"; else print $8/$7}' > $scores -#paste FINALtable scores | sort -k 9 -n > SUPERfinal -#paste $TABLEE $normalizedTABLE | awk '{if ($7==0) print "--"; else print $8/$7}' | sort -k 9 -n - -awk -v min="$MIN" '{print $0"\t"$5*min/$4"\t"$6*min/$4}' $bigTABLE | awk '{if ($7==0) print $0"\t""--"; else print $0"\t"$8/$7}' > $finalTABLE--SORTED -awk -v min="$MIN" '{print $0"\t"$5*min/$4"\t"$6*min/$4}' $bigTABLE | awk '{if ($7==0) print $0"\t""--"; else print $0"\t"$8/$7}' | sort -k 9 -n > $finalTABLE - - -### ADD headers to tables... -#header="location \t name \t description \t GeneSize \t INPUT.reads \t IP.reads \t norm.INPUTs \t norm.IPs \t IP-INPUT.ratio" -header="location \t name \t description \t GeneSize \t ${1}_reads \t ${1}_reads" -for table in {$finalTABLE--SORTED,$finalTABLE}; do - echo -e $header | cat - $table > tmp && mv tmp $table -done - - -#### -# for generating table to use with intergenic regions script: -#paste FINAL.table.BD2_IP_NGS-BD2_INPUT_NGS--UNSORTED /scratch2//s/scinet/mponce/Tetrahymena_Ryerson/1stAttempt/FINAL.table.BD1_IP_NGS-BD1_INPUT_NGS--UNSORTED | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"(($9+$18)*.5) }' > combinedTABLES_BD1-BD2 diff --git a/core/countReads-wFns.sh b/core/countReads-wFns.sh deleted file mode 100755 index b32cad1..0000000 --- a/core/countReads-wFns.sh +++ /dev/null @@ -1,254 +0,0 @@ -#!/bin/bash -# -vx - -# main script used to analize and call (count) peaks in the T.Thermophila data. -# This script also calls 2 other scripts: "table.sh" and "comb_tables.sh" -# The script requires BWA and samtools to be installed as they are used within the pipeline. - -################################# -### how to use it -# -# 1st argument: file with INPUT reads -# 2nd argument: file with IP reads -# 3rd argument: reference genome file (fasta) -# 4th argument: annotiation file (gff3) -# 5th argument: working space (if possible use RAMdisk --ie. /dev/shm/--, or /tmp in a computer with SSD) -# 6th argument (optional): number of cores to use for BWA multi-threading -# -# Examples: -# time PATHtoRACSrepo/core/countPeaks.sh _1_MED1_INPUT_S25_L007_R1_001 _3_MED1_IP_S27_L007_R1_001 T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /tmp/ 16 -# time PATHtoRACSrepo/core/countPeaks.sh _1_MED1_INPUT_S25_L007_R1_001 _3_MED1_IP_S27_L007_R1_001 T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /dev/shm/ 16 -# -################################# - - -### functions ######################################### -LOChelpMsg(){ - echo "How to use this script:" - cancel; -} -# -# -LOCcheckTools(){ - BWA=$(which bwa); - [ -z $BWA ] && errMsg "BWA needs to be installed for using this pipeline!" - SAMTOOLS=$(which samtools) - [ -z $SAMTOOLS ] && errMsg "SAMTOOLS needs to be installed in the system, as it is required in the pipeline!" -} - -##### - -LOCcheckIntegrityPipeline(){ - scriptsDIR=`dirname $0` - [ -z $scriptsDIR/table.sh ] && errMsg "missing 'table.sh' from pipeline directory $scriptsDIR" - [ -z $scriptsDIR/comb_tables.sh ] && errMsg "mssing 'comb_tables.sh' from pipeline directory $scriptsDIR" -} - -# -####################################################### -# setting preamble, detecting scripts location -scriptsDIR=`dirname $0` - -# load auxiliary fns for integrity checks and message/error handling -if [[ -f $scriptsDIR/auxs/auxFns.sh ]]; then - . $scriptsDIR/auxs/auxFns.sh --source-only; -else - echo "Error auxiliary file: " $scriptsDIR/auxs/auxFns.sh "NOT found!" - exit -fi -####################################################### - -#### CHECKS ########################################### -welcome - -### CHECK arguments -if [[ $# -eq 0 ]]; then - errMsg "No arguments supplied!"; -fi -# -if [[ $# -lt 5 ]]; then - errMsg "Five mandatory arguments are needed!"; -fi - -### CHECK software -checkIntegrityPipeline table.sh comb_tables.sh -checkTools bwa samtools -###################################################### - -# ==================================================== -# DEFINE SOME env.VARIABLES -#dataDIR=DATA -#dataDIR=data2 -#[ -d $1 ] && dataDIR=$1 || errMsg "1st argument should be a directory, where the reference data to be used is located" - -## reference file -#fastaFILE=$dataDIR/T_thermophila_June2014_assembly.fasta # <==== this could be another argument!!! -# -#faiFILE=$fastaFILE.fai - -## target files -## TO BE READ as arguments... $1 (INPUT) and $2 (IP) -[ -s $1 ] && INPUTfile=`basename $1` || errMsg "2nd argument should be a file with the INPUT reads" -[ -s $2 ] && IPfile=`basename $2` || errMsg "3rd argument should be a file with the IP reads" -[ -s $3 ] && FASTAfile=`basename $3` || errMsg "4th argument should be a file with the reference genome file (fasta)" -[ -s $4 ] && REFfile=`basename $4` || errMsg "5th argument should be an annotation file (gff3)" - -faiFILE=$FASTAfile -fastaFILE=$FASTAfile - -inputFILE1=INPUTfile -inputFILE2=IPfile - -# could be RAMdisk or TEMP in a SSD -# use RAMDISK -#RAMdisk=/dev/shm/$USER/ -#RAMdisk=/tmp/$USER/ -RunTimeDir=ORF_RACS-$$-`date '+%Y%m%d-%H%M%S'` -[ -d $5 ] && WORKINGdir=$5/$USER/$RunTimeDir || errMsg "6th argument must be a directory, we suggest using /dev/shm for RAMdisk or /tmp in a SSD device, as this pipeline is quite I/O intense!" -echo "Working directory --> " $WORKINGdir - - -# number of threads to use with BWA -cores=$(grep -P '^core id\t' /proc/cpuinfo| wc -l) -#[ -z $6 ] && NT=$6 || NT=$cores -if [ " "$6 != " " ]; then NT=$6 ; else NT=$cores; fi -echo "BWA will use nbrCores="$NT", when possible" - -## ===== - -#inputFILE1=$INPUTfile #.fastq.gz -#[ -s $inputFILE1 ] || errMsg $inputFILE1" not found" -#inputFILE2=$IPfile #.fastq.gz -#[ -s $inputFILE2 ] || errMsg $inputFILE2" not found" - -# save original directory -myDIR=$(pwd) -resultsDIR=$myDIR/ORF_RACS_results-`date '+%Y%m%d-%H%M%S'` #`date +%D-%T` -scriptsDIR=`dirname $0` - - -# use RAMDISK -#RAMdisk=/dev/shm/$USER/ -#RAMdisk=/tmp/$USER/ - -echo # create local space on RAMDISK -mkdir -p $WORKINGdir -# copy data into WORKINGdir -#cp -pr ./DATA $RAMdisk -cp -prv $1 $WORKINGdir/$inputFILE1 -cp -prv $2 $WORKINGdir/$inputFILE2 -cp -prv $3 $WORKINGdir/$FASTAfile -cp -prv $4 $WORKINGdir/$REFfile -# move to WORKINGdir -cd $WORKINGdir - - -# outputs -outputINPUTbam=aln$INPUTfile.bam -outputINPUTbamSORTED=aln$INPUTfile-sorted.bam -outputINPUTsam=aln$INPUTfile.sam -#outputIPbam=alnBD1_IP.bam -#outputIPbamSORTED=alnBD1_IP-sorted.bam -#outputIPsam=alnBD1_IP.sam -outputIPbam=aln$IPfile.bam -outputIPbamSORTED=aln$IPfile-sorted.bam -outputIPsam=aln$IPfile.sam -#output3gzip=/dev/shm/$USER/alnBD1_INPUT.tar.gz - -# tables files -tableINPUTs=tableReadsINPUT.`basename $INPUTfile .fastq.gz` -tableIPs=tableReadsIP.`basename $IPfile .fastq.gz` - -####===================================================== - - -## pipeline -# index the assembly -# generates 4 aux. files (in principle could be removed! -- *.fasta.*) -$BWA index $fastaFILE -#rm *.fasta.* - -# align the INPUT to assembly -# bwa -> multithreaded... (upto 4) -$BWA aln -t $NT $fastaFILE $inputFILE1 > $outputINPUTbam - -# convert to sequence coordinate for INPUT -$BWA samse $fastaFILE $outputINPUTbam $inputFILE1 > $outputINPUTsam - -# remove inputFILE1 in case of memory issues -#rm -v $inputFILE1 - -####### -# zip file in RAMDISK -#tar cvzf $output3gzip $output2sam -###### - -# align the IP file to the assembly -# bwa -> multithreaded... (upto 4) -$BWA aln -t $NT $fastaFILE $inputFILE2 > $outputIPbam - -# convert to sequence coordinate for IP -$BWA samse $fastaFILE $outputIPbam $inputFILE2 > $outputIPsam - -# remove inputFILE2 in case of oom -#rm -v $inputFILE2 - - -# FAI index to the assembly -$SAMTOOLS faidx $fastaFILE - - -# align assembly witht the sam and bam files -$SAMTOOLS import $faiFILE $outputINPUTsam $outputINPUTbam -$SAMTOOLS import $faiFILE $outputIPsam $outputIPbam - -# sorting for IGV -$SAMTOOLS sort $outputINPUTbam -o $outputINPUTbamSORTED -$SAMTOOLS sort $outputIPbam -o $outputIPbamSORTED - -$SAMTOOLS index $outputINPUTbamSORTED -$SAMTOOLS index $outputIPbamSORTED - - -# define table file for extracting genes... passed as third argument to the script!!! -table=table.$REFfile -# extract the reads -touch $tableINPUTs #tableReadsINPUT -# read scaffolds from table generated by the "table" script and loop over the sorted BAM files -# NEEDS the file "table" generated by the script 'table.sh' // if it doesn't exist, launch its creation in parallel -### [ ! -e $resultsDIR/$table ] || $scriptsDIR/table.sh $REFfile && cp -v $resultsDIR/$table . -$scriptsDIR/table.sh $REFfile -[ $? -ne 0 ] && errMsg "' $scriptsDIR/table.sh $REFfile' FAILED...! exitcode $?" - -for i in `awk '{print $1}' < $table `; do - #echo $i; - $SAMTOOLS view $outputINPUTbamSORTED $i | wc -l >> $tableINPUTs #tableReadsINPUT -done - -touch $tableIPs #tableReadsIP -for i in `awk '{print $1}' < $table `; do - #echo $i; - $SAMTOOLS view $outputIPbamSORTED $i | wc -l >> $tableIPs #tableReadsIP -done - - -#remove temporal files... -rm -v *.fasta.* - -### copy data into original directory -mkdir -p $resultsDIR -cp $outputINPUTbamSORTED $outputIPbamSORTED $resultsDIR -cp $outputINPUTsam $outputIPsam $resultsDIR -cp $tableINPUTs $resultsDIR -cp $tableIPs $resultsDIR -# need the index files for intergenic regions determination -cp *bai $resultsDIR - -# FINAL step, combine tables together... -#. $scriptsDIR/comb_tables.sh _2_MED2_INPUT_S26_L007_R1_001 _4_MED2_IP_S28_L007_R1_001 T_thermophila_June2014.gff3 -$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile -[ $? -ne 0 ] && errMsg "'$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile' FAILED...! exitcode $?" -cp -v FINAL.table.* $resultsDIR - -# clean temp dir used -rm -rfv $WORKINGdir diff --git a/core/intergenic/det-interGenes-ORIG.sh b/core/intergenic/det-interGenes-ORIG.sh deleted file mode 100755 index 12d6123..0000000 --- a/core/intergenic/det-interGenes-ORIG.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash - -# det-interGenes.sh -# main script in charge of determining the intergenic regions -# this part of the pipeline requires the table generated by the ORF reads/counts -# -# -# HOW TO USE THIS SCRIPT: -# arg1: final combined table gnerated by the ORF from the RACS pipeline -# arg2: reference genome file (gff3) -# arg3: name of the file where to save the InterGenic Regions -# arg4/5: name of the output file from ORF part/tag.(ref.) name -# arg6/7: ... -# ... -# -# eg. -# det-interGenes.sh combinedTABLES_MED1-MED2 dataset/T_thermophila_June2014.sorted.gff3 interGENs_MED1-MED2.csv -# Fillingham_1_MED1_INPUT_S25_L007_R1_001 INPUTm1 \ -# Fillingham_2_MED2_INPUT_S26_L007_R1_001 INPUTm2 \ -# Fillingham_3_MED1_IP_S27_L007_R1_001 IPm1 \ -# Fillingham_4_MED2_IP_S28_L007_R1_001 IPm2 \ -# ... -# - -################################################# -# Setting preamble, detecting scripts location -scriptsDIR=`dirname $0` - -################################################# -# load auxiliary fns for integrity checks and message/error handling -. $scriptsDIR/../auxs/auxFns.sh --source-only - -#### CHECKS ##################################### -### CHECK arguments -if [[ $# -eq 0 ]]; then - errMsg "No arguments were supplied!"; -fi -# - -### INTEGRITY CHECK -# check RACS scripts -checkIntegrityPipeline intergenic/det-interGenes.sh intergenic/interGeneRegions.R intergenic/utils_RACS-IGR.R intergenic/interGenes.sh -# check external tools needed -checkTools samtools Rscript -################################################# - -# getting command line arguments - -combTABLE=$1 #combinedTABLES_MED1-MED2 -refFILE=$2 #dataset/T_thermophila_June2014.sorted.gff3 -interGENregions=$3 #interGENs_MED1-MED2.csv -- output from Rscript -# -INPUTfile1=$4 #Fillingham_1_MED1_INPUT_S25_L007_R1_001 -tag1=$5 #INPUTm1 -INPUTfile2=$6 #Fillingham_2_MED2_INPUT_S26_L007_R1_001 -tag2=$7 #INPUTm2 -INPUTfile3=$8 #Fillingham_3_MED1_IP_S27_L007_R1_001 -tag3=$9 #IPm1 -INPUTfile4=${10} #Fillingham_4_MED2_IP_S28_L007_R1_001 -tag4=${11} #IPm2 -# -combinedFinalIGenicTable=${interGENregions}_INPUTs-IPs`date '+%Y%m%d-%H%M%S'`.csv -# ================================================ - -# step #0: look into combTABLE and generate list of intergenic regions -t0=$(time Rscript $scriptsDIR/interGeneRegions.R $combTABLE $refFILE $interGENregions) - -# process each of the samples in comparison to the intergenic regions just generated in 'step #0' -t1=$(time $scriptsDIR/interGenes.sh $tag1 $INPUTfile1 $interGENregions) -#t1=$(time . $scriptsDIR/interGenes.sh INPUTm1 Fillingham_1_MED1_INPUT_S25_L007_R1_001 interGENs_MED1-MED2.csv ) -t2=$(time $scriptsDIR/interGenes.sh $tag2 $INPUTfile2 $interGENregions) -#t2=$(time . $scriptsDIR/interGenes.sh INPUTm2 Fillingham_2_MED2_INPUT_S26_L007_R1_001 interGENs_MED1-MED2.csv ) -t3=$(time $scriptsDIR/interGenes.sh $tag3 $INPUTfile3 $interGENregions) -#t3=$(time . $scriptsDIR/interGenes.sh IPm1 Fillingham_3_MED1_IP_S27_L007_R1_001 interGENs_MED1-MED2.csv ) -t4=$(time $scriptsDIR/interGenes.sh $tag4 $INPUTfile4 $interGENregions) -#t4=$(time . $scriptsDIR/interGenes.sh IPm2 Fillingham_4_MED2_IP_S28_L007_R1_001 interGENs_MED1-MED2.csv ) - -# combine tables with INPUT/IP reads -#t5=$(time paste interGENs_MED1-MED2.csv interGENs-Fillingham_1_MED1_INPUT_S25_L007_R1_001 interGENs-Fillingham_3_MED1_IP_S27_L007_R1_001 interGENs-Fillingham_2_MED2_INPUT_S26_L007_R1_001 interGENs-Fillingham_4_MED2_IP_S28_L007_R1_001 > interGENs_MED1-MED2_INPUTs-IPs.csv ) -t5=$(time paste $interGENregions interGENs-$INPUTfile1 interGENs-$INPUTfile3 interGENs-$INPUTfile2 interGENs-$INPUTfile4 > $combinedFinalIGenicTable) - -# statistics on running times -echo "Partial Times: $t0, $t1, $t2, $t3, $t4" -echo Total time: $t0+($t1+$t2+$t3+$t4)+$t5 - -