diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..6a85685
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,3 @@
+MARCELO PONCE (mponce@scinet.utoronto.ca)
+ALEJANDRO SAETTONE (alejandro.saettone@ryerson.ca)
+
diff --git a/CITATION b/CITATION
new file mode 100644
index 0000000..9e515e9
--- /dev/null
+++ b/CITATION
@@ -0,0 +1,2 @@
+Please cite our paper (CITE_RACS) whenever you are using RACS.
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..74fc02c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+Copyright 2018 Marcelo Ponce, Alejandro Saettone
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+- Please cite our paper (CITE_RACS) whenever you are using RACS.
+
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
diff --git a/README b/README
new file mode 100644
index 0000000..227db41
--- /dev/null
+++ b/README
@@ -0,0 +1,358 @@
+RACS: Rapid Analysis of ChIP-Seq data for contig-based genomes
+
+These tools are a series of scripts developed to facilitate the analysis
+of ChIP-Seq data; they have been applied to the organism T. thermophila.
+
+The scripts should work on any Linux-type OS (i.e. Mac OS and Linux).
+They require the following programs to be installed:
+   - SAMtools
+   - BWA
+   - R
+
+The first two are needed for the determination of reads in open reading frames
+(ORF, i.e. genic regions), while SAMtools and R are needed for the determination
+of intergenic regions (IGR).
+
+The main scripts are located in the "core" directory.
+Additionally, we provide other subdirectories containing the following:
+   - "hpc": submission scripts for typical job managers and schedulers in HPC environments
+   - "tools": comparison tools developed to compare against other software, e.g. MACS
+   - "datasets": examples of the data used to test and run our pipeline
+
+This pipeline is open source under the MIT license, and researchers are welcome to use it.
+We would appreciate users letting us know of any bugs they find, as well as any
+feedback about their experience using RACS.
+Please cite our paper (CITE_RACS) whenever you are using RACS.
+
+
+
+* INSTALLATION
+
+The scripts composing this pipeline are available as open source tools,
+and can be obtained from any of the following repositories:
+
+   https://gitrepos.scinet.utoronto.ca/public/RACS.git
+
+or
+
+   https://bitbucket.org/mjponce/racs
+
+Both repositories are synchronized, so that the latest version of RACS will be available in both
+repositories simultaneously.
+
+To obtain and install a copy of RACS on your computer, open a terminal (you will need git and
+an internet connection!)
+and type:
+
+   git clone https://gitrepos.scinet.utoronto.ca/public/RACS.git
+
+That should clone (download and copy) the latest version of our pipeline onto your computer,
+creating a directory called "RACS", containing the following files and sub-directories:
+
+``
+RACS
+├── AUTHORS
+├── CITATION
+├── LICENSE
+├── README
+├── WARRANTY
+├── core
+│   ├── countReads.sh
+│   ├── table.sh
+│   ├── comb_tables.sh
+│   ├── auxs
+│   │   ├── auxFns.sh
+│   │   └── testFns.sh
+│   ├── intergenic
+│   │   ├── det-interGenes.sh
+│   │   ├── interGeneRegions.R
+│   │   └── interGenes.sh
+│   └── test
+│       └── lst
+├── datasets
+│   ├──
+│   ├──
+│   └──
+├── hpc
+│   ├── submission.pbs
+│   ├── submission.slurm
+│   ├── IGR-jobs_parallel.gnu
+│   └── modules
+└── tools
+    ├──
+    ├──
+    └──
+''
+
+
+Updating RACS:
+Because the RACS pipeline is available under version control (git), you can easily get updates
+and the latest additions to the RACS pipeline by simply typing the following command in the
+RACS directory:
+
+   git pull
+
+This will bring into your local installation the latest updates to the pipeline.
+Similarly, any of the git functionalities should work as expected, e.g. git log, git status, etc.
+
+
+
+* Integrity checks
+For integrity purposes we have included a series of tools to check whether the pipeline's
+internal script dependencies, as well as the external programs it relies on, are available
+in the system, so that RACS' functionality is guaranteed.
+Some of RACS' routines will run these integrity checks whenever appropriate at the moment
+of execution.
+The user can also test this by executing the following command in the shell:
+
+   PATHtoRACSrepo/core/auxs/testFns.sh
+
+At this point a series of checks will be executed and the results shown on the screen, e.g.
+
+*** CHECKING RACS pipeline INTERNAL INTEGRITY...
+   countReads.sh  ... ok!
+   table.sh  ... ok!
+   comb_tables.sh  ... ok!
+   intergenic/interGenes.sh  ... ok!
+   intergenic/interGeneRegions.R  ... ok!
+   intergenic/det-interGenes.sh  ... ok!
+   auxs/testFns.sh  ... ok!
+   auxs/auxFns.sh  ... ok!
+*** CHECKING EXTERNAL DEPENDENCIES with auxiliary software: BWA, SAMTOOLS & RSCRIPT...
+   bwa  ... found!
+   samtools  ... found!
+   Rscript  ... found!
+
+If any of the dependencies is not met, an error message describing the issue will be
+displayed, followed by basic instructions on how to use RACS.
+
+
+
+* HOW TO USE THE 'racs' PIPELINE:
+
+RACS offers two main functionalities:
+- count reads in ORF
+and
+- identify reads in intergenic regions (using information about the biology of the model)
+
+
+* Counting Reads in ORF:
+
+- "countReads": countReads.sh is the main script in the pipeline; it is a shell
+script in charge of implementing and distributing the actual pipeline to count
+the reads. It combines several instances of shell commands and offloads work
+to the packages SAMtools and BWA.
+This script can utilize multi-core parallelization when possible, via threads
+which can be explicitly input by the user or automatically detected by the script.
+
+At the end of the execution the script will produce a table indicating XXXXX.
+The resulting files, some intermediate files and the final tables, are located
+in a directory named 'ORF_RACS_results-YYYYMMDD-HHMMSS', created in the directory
+from where the script is executed, where 'YYYYMMDD-HHMMSS' indicates the
+year-month-day and time when the script was executed.
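+
+For instance, since each run creates a new timestamped results directory, the most
+recent one can be located with a one-liner like the following (just a sketch, not
+part of the pipeline itself):
+
+   ls -d ORF_RACS_results-*/ | sort | tail -1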
+
+
+Inputs:
+   - INPUT file (fastq.gz)
+   - IP file (fastq.gz)
+   - fasta assembly file (.fasta)
+   - reference genome file (.gff3)
+
+Outputs:
+   - several sam, bam and bai files for each input (INPUT and IP) file
+   - intermediate tables with reads for the INPUT and IP files, named 'tableReads{INPUT/IP}.{input/ip-filename}'
+   - final table sorted by scaffold and localization within the scaffold,
+     named 'FINAL.table.{INPUTfilename}-{IPfilename}'
+
+
+When executing the scripts, please indicate the full path to the location of
+the script. In this way the script will determine where the repository is
+located and find the different tools needed during the execution.
+Also, please do not "source" the scripts; they are set to be run as executable
+scripts. I.e. execute the scripts in this way:
+
+   PATHtoRACS/core/../scriptNAME.sh args
+
+Do *NOT* execute them in this way:
+
+   . PATHtoRACS/core/../scriptNAME.sh args
+
+nor in this way either:
+
+   source PATHtoRACS/core/../scriptNAME.sh args
+
+Our scripts will also try to detect this situation and prevent the user from doing this.
+Due to the modular implementation we designed for RACS, and in order to allow RACS
+to locate its main components, the scripts need to be executed as described above
+and not sourced.
+
+
+Integrity & Sanity Checks
+In each of the main RACS scripts we have incorporated several integrity checks which
+are run at the very beginning of the scripts' execution. These tests include
+determining whether the tools used (e.g. BWA, SAMTOOLS or R) are installed in the
+system and available to be used within the pipeline.
+Similarly, by checking the location of the script, the pipeline verifies that the
+other components of the pipeline are also in place and can be found, so that the
+pipeline can run without any problems.
+In this way, there is no need to add the different scripts of the pipeline to the
+PATH, and the scripts are self-aware of where they are placed.
+To achieve this, the scripts need to be called specifying their full location on
+the computer.
+
+The different scripts in the pipeline will also check the arguments specified; in
+particular, when these are supposed to be existing files, that they actually exist!
+We basically tried our best to implement defensive programming across the different
+scripts that compose the RACS pipeline, to protect its proper execution and help the
+user establish the proper way of using RACS.
+We also included a simple testing routine, in the 'auxs' subdir, that can be used to
+run some of the integrity tests that the pipeline will be checking during execution,
+as described in the section above.
+
+
+Arguments to the script:
+   1st argument: file with INPUT reads, usually a ".fastq.gz" file, obtained from the sequencer
+   2nd argument: file with IP reads, usually a ".fastq.gz" file, obtained from the sequencer
+   3rd argument: reference genome file (fasta)
+   4th argument: annotation file (gff3)
+   5th argument: working space (if possible use RAMdisk --ie. /dev/shm/--, or
+      /tmp in a computer with SSD)
+   6th argument (optional): number of cores to use for BWA multi-threading.
+      If this argument is not specified, the script will attempt to determine
+      automatically the number of cores and use that number to run in multi-threading.
+      If you want to run the script serially, i.e. with just one core, you should
+      set this argument to 1.
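+
+For example, a minimal serial (single-core) run would look like this, where the
+data file names are placeholders for your own files:
+
+   PATHtoRACSrepo/core/countReads.sh INPUT.fastq.gz IP.fastq.gz assembly.fasta annotation.gff3 /tmp/ 1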
+
+
+The main output file, FINAL.table."INPUTfile"-"IPfile"
+(where "INPUTfile" and "IPfile" refer to the corresponding INPUT and IP files),
+is ordered by the scaffold localization.
+Notice that this file <<FINAL.table."INPUTfile"-"IPfile">> is the one that
+will be used in the second part of the pipeline, when detecting the intergenic regions.
+This final file is a tab-separated text file containing the following columns:
+
+   location   name   description   geneSize   INPUT.reads   IP.reads
+
+where
+   "location" is the scaffold location of the gene
+   "name" is the name of the gene
+   "description" is a description of the gene
+   "geneSize" represents the size of the gene
+   "INPUT.reads" is the number of reads (calls) for the INPUT file
+   "IP.reads" is the number of reads (calls) for the IP file
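+
+As a quick illustration of this layout, a per-gene IP/INPUT enrichment ratio could
+be computed from columns 5 and 6 with a one-liner like the following (a sketch only,
+not part of the pipeline; it skips the header line and genes without INPUT reads):
+
+   awk -F'\t' 'NR>1 && $5>0 {print $1"\t"$6/$5}' FINAL.table.INPUTfile-IPfile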
+
+
+
+* Normalization of ORF by PF Cluster Scores
+If your data contains the Passing Filter (PF) cluster scores, you can use an additional
+shell script that we have included to normalize your INPUT/IP reads.
+This script is called "normalizedORF.sh" and is located in the core subdirectory.
+The script requires three mandatory arguments:
+   1st argument: "FINAL.table.*" file generated from the RACS' ORF pipeline
+   2nd argument: "PF-INPUT-value", PF value corresponding to the INPUT file
+   3rd argument: "PF-IP-value", PF value corresponding to the IP file
+
+Please notice that arguments 2 and 3 are the actual numerical values corresponding to
+the PF clusters for the INPUT and IP respectively.
+
+
+* Determination of InterGenic Regions
+The second main functionality of our RACS pipeline is the ability to determine
+intergenic regions, based on the results from the ORF reads and the biology of the
+model organism.
+To achieve this second part, we combined a series of shell scripts and R scripts.
+As before, when executed, the pipeline will perform some sanity and integrity
+checks to guarantee that all the necessary pieces are in place.
+All the scripts for this part of the pipeline are located in the 'intergenic' subdir.
+The flow is directed by a simple shell script, 'det-interGenes.sh', which receives
+four command-line arguments and calls an R script, 'interGeneRegions.R', and,
+by the end of the execution, a second shell script, 'interGenes.sh'.
+The R script, 'interGeneRegions.R', is the actual brain and main script in charge of
+determining the regions between genic boundaries.
+It is designed in a modular fashion, implemented as a main driver script
+('interGeneRegions.R' itself) and a utility file ('utils_RACS-IGR.R') where all the
+functions used in the main driver are defined.
+The second shell script, 'interGenes.sh', is in charge of computing the number of
+reads/calls within the determined intergenic regions, for which it also uses SAMtools.
+The main shell script keeps track of the time employed in each stage of the process,
+and offers a summary of it at the end of the execution.
+
+Arguments to the script:
+   arg1: final combined table generated by the ORF part of the RACS pipeline
+   arg2: reference genome file (gff3)
+   arg3: name of the file where to save the InterGenic Regions
+   arg4: text file containing the names of the sorted BAM output files from the
+         ORF part of the pipeline; e.g.
+            alnDATASET_INDICATORS_ChIP_SXX_RY_ZZZ.fastq.gz-sorted.bam
+            alnDATASET_INDICATORS_Input_SXX_RY_ZZZ.fastq.gz-sorted.bam
+
+
+
+
+
+Examples:
+I) calling peaks for ORF
+I.i) the following command will run the countReads.sh script using:
+   - 'data2/_1_MED1_INPUT_S25_L007_R1_001.fastq.gz' as the file with the INPUT reads
+   - 'data2/_3_MED1_IP_S27_L007_R1_001.fastq.gz' as the file with the IP reads
+   - 'T_thermophila_June2014_assembly.fasta' as the reference genome for the T. thermophila organism
+   - 'T_thermophila_June2014.gff3' as the annotation file for the T. thermophila
+   - '/tmp/' as working space
+
+   PATHtoRACSrepo/core/countReads.sh  data2/_1_MED1_INPUT_S25_L007_R1_001.fastq.gz  data2/_3_MED1_IP_S27_L007_R1_001.fastq.gz  T_thermophila_June2014_assembly.fasta  T_thermophila_June2014.gff3  /tmp/
+
+
+I.ii) the following command will run the countReads.sh script using the same
+arguments as before, but specifying to use "/dev/shm" (RAMdisk) instead
+of "/tmp" as temporary storage for auxiliary files, and 16 threads in the
+parallel regions of the pipeline.
+Additionally, it is using the system's 'time' command to measure how long the
+pipeline takes to run.
+
+   time PATHtoRACSrepo/core/countReads.sh  data2/_1_MED1_INPUT_S25_L007_R1_001.fastq.gz  data2/_3_MED1_IP_S27_L007_R1_001.fastq.gz  T_thermophila_June2014_assembly.fasta  T_thermophila_June2014.gff3  /dev/shm/  16
+
+
+II) Determination of InterGenic Regions (IGR)
+II.i) the following command will determine the intergenic regions, using:
+   - 'combinedTABLES_MED1-MED2' as the table which was determined in the ORF part of the pipeline
+   - 'dataset/T_thermophila_June2014.sorted.gff3' as the micro-organism reference genome file
+   - 'interGENs_MED1-MED2.csv' as the name of the table that the IGR part of the pipeline will
+     generate, i.e. this will be the output of this part of the pipeline
+   - 'samples.file' as a text file containing the names of the BAM output files, also generated
+     when running the ORF part of RACS; they are usually named as
+        alnDATASET_INDICATORS_ChIP_SXX_RY_ZZZ.fastq.gz-sorted.bam
+        alnDATASET_INDICATORS_Input_SXX_RY_ZZZ.fastq.gz-sorted.bam
+
+   PATHtoRACSrepo/core/intergenic/det-interGenes.sh  combinedTABLES_MED1-MED2  dataset/T_thermophila_June2014.sorted.gff3  interGENs_MED1-MED2.csv  samples.file
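+
+A 'samples.file' like the one used above can be generated from within an ORF results
+directory with something as simple as (a sketch; it assumes the sorted BAM files
+produced by the ORF run are in the current directory):
+
+   ls aln*fastq.gz-sorted.bam > samples.file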
+
+
+II.ii) we included a submission script in the 'hpc' directory, named "IGR-jobs_parallel.gnu",
+which will scan the current directory looking for sub-directories named "ORF_RACS_*",
+which is the way RACS names the outcome of running the RACS' ORF pipeline.
+When the script detects one of these directories, it will look inside it and generate the
+corresponding 'samples.file' containing the names of all the aln*fastq.gz-sorted.bam files
+within this directory.
+When the search for ORF sub-directories is done, it will launch the IGR part of the pipeline
+in *parallel* for ALL the ORF results, using an 'embarrassingly parallel' approach via
+GNU-Parallel.
+Assuming you are located in a directory containing several ORF sub-directories, you will just
+run it in this way:
+
+   PATHtoRACSrepo/hpc/IGR-jobs_parallel.gnu
+
+In principle the script does not require any command-line argument, but it contains several
+environment variables defined in it that should be adjusted for the particular system where
+it will run, i.e.:
+   - RACSloc="location where RACS is installed in your system"
+   - REFgff3="location where the genome reference file of the organism is located in your system"
+
+Additionally, one could adjust the following variables:
+   - PARALLEL_TASKS_PER_NODE="number of parallel jobs to run at the same time", although
+     by default the script will try to determine the maximum number of cores available
+     on the system
+   - ORFdirs="ORF_RACS_results-", matching pattern for the beginning of the directories
+     generated by the RACS' ORF
+
+
+III) normalization and cut-off with negative control
+
+
+IV) normalization and cut-off without a negative control
+
+
+V) comparison to MACS and XXXX
+
diff --git a/WARRANTY b/WARRANTY
new file mode 100644
index 0000000..1b09e22
--- /dev/null
+++ b/WARRANTY
@@ -0,0 +1,8 @@
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/core/auxs/auxFns.sh b/core/auxs/auxFns.sh
new file mode 100755
index 0000000..965336e
--- /dev/null
+++ b/core/auxs/auxFns.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+## -vx
+
+# auxiliary set of functions used in other scripts
+# it includes:
+#   - welcome(): RACS welcome message and credits
+#
+#   - helpMsg(): help message describing how to use this script
+#
+#   - usage(): an improved self-documented help fn based on the same script's comments
+#
+#   - checkTools(): fn to check the availability of a given tool,
+#     by looking at whether such an executable/program is present in the current system;
+#     the fn can process several arguments at once
+#
+#   - checkIntegrityPipeline(): fn to check scripts internal to the pipeline;
+#     the fn will check whether a specific script exists and has the executable attribute;
+#     the fn can process several arguments at once
+#
+#################################
+
+
+### functions ########################################
+
+##### Informative functions
+welcome() {
+    echo "RACS v1.0 (2018/2019) -- Open source tools for Analyzing ChIP-Seq data"
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+}
+
+##
+
+helpMsg(){
+    scriptsDIR=`dirname $0`
+    welcome
+    echo $scriptsDIR
+    echo "How to use this script:"
+    more $scriptsDIR/../../README
+    exit 1;
+}
+##
+usage() {
+    shebang="#!/bin/bash"
+    scriptName=`basename $0`
+    scriptsDIR=`dirname $0`
+    welcome
+    echo "************************"
+    echo $scriptName
+    echo "************************"
+    echo "How to use this script:"
+    upto=$(grep -v $shebang $0 | grep -n "####################" | head -1 | awk -F":" '{print $1}')
+    grep -v $shebang $0 | head -$upto | more
+    exit 222
+}
+
+##
+
+##### Checking/Testing Functions
+checkTools(){
+    echo "checking for tools required for RACS..."
+    for tool in "$@"; do
+        # look for the 'tool' in the executable path
+        statusTOOL=$(which $tool);
+        # if 'which' returned an empty string, the tool was NOT found!
+        [ -z $statusTOOL ] && errMsg "$tool needs to be installed for using this pipeline!" || echo "   $tool ... found!"
+    done
+}
+
+#####
+
+checkIntegrityPipeline(){
+    echo "Verifying RACS integrity..."
+    scriptsDIR=`dirname $0`
+    for dep in "$@"; do
+        # this will make the fn look in the location from where it is being called,
+        # ie. relative to the location of the calling script...
+        testing=$scriptsDIR/$dep
+        #echo $testing
+        # check that the file exists and has the execution attribute
+        [ -x $testing ] && echo "   $dep ... ok!" || errMsg "err.critical: missing '$dep' from pipeline directory $scriptsDIR: $testing"
+    done
+}
+
+#########
+
+#####
+# Functions for checking command-line arguments
+#####
+
+# Function that checks whether the argument is an existing file
+checkFile() {
+    for file in "$@"; do
+        echo 'verifying file...' $file
+        [ ! -f "$file" ] && errMsg "File $file NOT found!"
+    done
+}
+
+
+#########
+
+# Function that checks that the argument is of numeric type, ie. an integer number
+checkNbr() {
+    numericDigits='^[0-9]+$'
+    if ! [[ $1 =~ $numericDigits ]] ; then
+        errMsg "$2 argument should be a positive integer number! Check your PF cluster information!"
+    fi
+}
+
+#########
+
+# Function that checks whether the argument is of a particular type
+checkOption() {
+    if [[ $1 != "A" && $1 != "D" ]] ; then
+        errMsg "$2 argument should be either 'A' for ascending or 'D' for decreasing order!"
+    fi
+}
+
+#########
+
+######
+# Other miscellaneous fns
+######
+
+
+# Function to detect the number of cores
+
+detectCores() {
+    nbrCores=$(grep -P '^core id\t' /proc/cpuinfo | wc -l)
+    echo $nbrCores
+}
+
+
+# functions to time execution
+
+tick() {
+    t0=`date +%s`
+}
+
+tock(){
+    t1=`date +%s`
+    echo $((t1-t0))
+}
+
+########################################################
+
+######################################################
+# helper fns for handling error messages
+pausee() { echo "----------------------------------------"; read -p "$*"; }
+msg() { echo "$0: $*" >&2; }
+errMsg() { msg "$*"; pausee 'Press [Enter] to continue...'; usage; exit 111;}
+try() { "$@" || errMsg "cannot $*"; }
+#######################################################
+
+######################################################
+
+# ====================================================
+myDIR=$(pwd)
+resultsDIR=$myDIR/results-`date '+%Y%m%d-%H%M%S'`    #`date +%D-%T`
+
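+
+# Example usage of the timing helpers above (a sketch, for illustration only;
+# 'some_command' is a placeholder for any stage of the pipeline):
+#
+#    tick
+#    some_command
+#    elapsed=$(tock)
+#    echo "stage took ${elapsed} seconds"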
diff --git a/core/auxs/testFns.sh b/core/auxs/testFns.sh
new file mode 100755
index 0000000..3ce95cd
--- /dev/null
+++ b/core/auxs/testFns.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+
+# unit testing (integrity tests) for the pipeline
+#
+# script containing a series of integrity checks: internal and external dependencies of the pipeline
+
+# determine where the scripts are located
+scriptsDIR=`dirname $0`
+
+# load auxiliary fns
+. $scriptsDIR/auxFns.sh --source-only
+
+
+main() {
+    # integrity checks
+    echo "*** CHECKING RACS pipeline INTERNAL INTEGRITY..."
+    for i in `cat $scriptsDIR/../test/lst`; do
+        #echo $i;
+        checkIntegrityPipeline ../$i;
+    done
+
+    # external dependencies
+    echo "*** CHECKING EXTERNAL DEPENDENCIES with auxiliary software: BWA, SAMTOOLS & RSCRIPT..."
+    # checking requirements for the ORF part of the RACS pipeline
+    checkTools bwa samtools
+    # checking requirements for the intergenic part of the RACS pipeline
+    checkTools samtools Rscript
+}
+
+
+if [ "${1}" != "--source-only" ]; then
+    main "${@}"
+fi
+
+#########
diff --git a/core/comb_tables.sh b/core/comb_tables.sh
new file mode 100755
index 0000000..65984cd
--- /dev/null
+++ b/core/comb_tables.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+###-xv
+
+# comb_tables.sh --- RACS ORF pipeline
+#
+# auxiliary internal script to the RACS ORF pipeline, called from countReads.sh
+#
+# requires 3 arguments:
+#   1) INPUTfile (eg. BD1_INPUT)
+#   2) IPfile (eg. BD1_IP)
+#   3) T_THERM file (eg. T_thermophila_June2014.gff3)
+INPUTfile=`basename $1 .fastq.gz`
+IPfile=`basename $2 .fastq.gz`
+TTHERMfile=$3
+
+tableINPUTs=tableReadsINPUT.$INPUTfile
+
+tableIPs=tableReadsIP.$IPfile
+
+
+bigTABLE=TABLEE
+#normalizedTABLE=normalized.table.$IPfile-$INPUTfile
+#finalTABLE=FINAL.table.$IPfile-$INPUTfile
+IxFILES=$INPUTfile-$IPfile
+normalizedTABLE=normalized.table.$IxFILES
+finalTABLE=FINAL.table.$IxFILES
+
+# combine the tables generated by "table.sh" and "countReads.sh"
+paste table.$TTHERMfile $tableINPUTs $tableIPs > $bigTABLE
+
+# compute normalized quantities
+MIN=$(sort -n -k 4 $bigTABLE | head -1 | awk '{print $4}')
+
+# find how many genes have the MIN gene size
+MINgenes=$(awk -v min="$MIN" '{ if($4==min) print $0}' $bigTABLE | wc -l)
+echo "Minimum gene size found: "$MIN
+echo "Number of genes with MIN gene size "$MINgenes
+
+# $5: nbr reads for INPUT, $6: nbr of reads for IP; $4: gene size
+##awk -v min="$MIN" '{print $5*min/$4" "$6*min/$4}' $bigTABLE > $normalizedTABLE
+##awk '{print $5*150./$4"\t"$6*150./$4}' TABLEE > normalizedTABLE
+#paste $bigTABLE $normalizedTABLE > $FINALtable
+### this calculation was doing a previous normalization using the MIN.gene.size
+### which is obsolete and NOT needed anymore...
+### awk -v min="$MIN" '{print $0 $5*min/$4"\t"$6*min/$4}' $bigTABLE
+
+
+#awk '{if ($7==0) print "--"; else print $8/$7}' FINALtable > scores
+#paste $TABLEE $normalizedTABLE | awk '{if ($7==0) print "--"; else print $8/$7}' > $scores
+#paste FINALtable scores | sort -k 9 -n > SUPERfinal
+#paste $TABLEE $normalizedTABLE | awk '{if ($7==0) print "--"; else print $8/$7}' | sort -k 9 -n
+
+### this calculation was doing a previous normalization using the MIN.gene.size
+### which is obsolete and NOT needed anymore...
+### awk -v min="$MIN" '{print $0"\t"$5*min/$4"\t"$6*min/$4}' $bigTABLE | awk '{if ($7==0) print $0"\t""--"; else print $0"\t"$8/$7}' > $finalTABLE--SORTED
+# in principle, there is only one final table, ordered by scaffold loc.
+#cp $bigTABLE $finalTABLE--SORTED
+
+### awk -v min="$MIN" '{print $0"\t"$5*min/$4"\t"$6*min/$4}' $bigTABLE | awk '{if ($7==0) print $0"\t""--"; else print $0"\t"$8/$7}' | sort -k 9 -n > $finalTABLE
+mv $bigTABLE $finalTABLE
+
+### ADD headers to tables...
+#header="location \t name \t description \t GeneSize \t INPUT.reads \t IP.reads \t norm.INPUTs \t norm.IPs \t IP-INPUT.ratio"
+header="location \t name \t description \t GeneSize \t ${1}_reads \t ${2}_reads"
+#for table in {$finalTABLE--SORTED,$finalTABLE}; do
+for table in $finalTABLE; do
+    echo -e $header | cat - $table > tmp && mv tmp $table
+done
+
+
+####
+# for generating the table to use with the intergenic regions script:
+#paste FINAL.table.BD2_IP_NGS-BD2_INPUT_NGS--UNSORTED /scratch2//s/scinet/mponce/Tetrahymena_Ryerson/1stAttempt/FINAL.table.BD1_IP_NGS-BD1_INPUT_NGS--UNSORTED | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"(($9+$18)*.5) }' > combinedTABLES_BD1-BD2
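+
+# Example (sketch): although comb_tables.sh is normally invoked by countReads.sh,
+# it can be run directly; the file names below are placeholders:
+#    PATHtoRACSrepo/core/comb_tables.sh INPUT.fastq.gz IP.fastq.gz T_thermophila_June2014.gff3
+# This expects 'table.T_thermophila_June2014.gff3', 'tableReadsINPUT.INPUT' and
+# 'tableReadsIP.IP' to already exist in the current directory.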
diff --git a/core/countReads-wFns.sh b/core/countReads-wFns.sh
new file mode 100755
index 0000000..b32cad1
--- /dev/null
+++ b/core/countReads-wFns.sh
@@ -0,0 +1,254 @@
+#!/bin/bash
+# -vx
+
+# main script used to analyze and call (count) peaks in the T. thermophila data.
+# This script also calls 2 other scripts: "table.sh" and "comb_tables.sh"
+# The script requires BWA and samtools to be installed, as they are used within the pipeline.
+
+#################################
+### how to use it
+#
+# 1st argument: file with INPUT reads
+# 2nd argument: file with IP reads
+# 3rd argument: reference genome file (fasta)
+# 4th argument: annotation file (gff3)
+# 5th argument: working space (if possible use RAMdisk --ie. /dev/shm/--, or /tmp in a computer with SSD)
+# 6th argument (optional): number of cores to use for BWA multi-threading
+#
+# Examples:
+#   time PATHtoRACSrepo/core/countReads-wFns.sh _1_MED1_INPUT_S25_L007_R1_001.fastq.gz _3_MED1_IP_S27_L007_R1_001.fastq.gz T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /tmp/ 16
+#   time PATHtoRACSrepo/core/countReads-wFns.sh _1_MED1_INPUT_S25_L007_R1_001.fastq.gz _3_MED1_IP_S27_L007_R1_001.fastq.gz T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /dev/shm/ 16
+#
+#################################
+
+
+### functions #########################################
+LOChelpMsg(){
+    echo "How to use this script:"
+    exit 1;
+}
+#
+#
+LOCcheckTools(){
+    BWA=$(which bwa);
+    [ -z $BWA ] && errMsg "BWA needs to be installed for using this pipeline!"
+    SAMTOOLS=$(which samtools)
+    [ -z $SAMTOOLS ] && errMsg "SAMTOOLS needs to be installed in the system, as it is required by the pipeline!"
+}
+
+#####
+
+LOCcheckIntegrityPipeline(){
+    scriptsDIR=`dirname $0`
+    [ ! -x $scriptsDIR/table.sh ] && errMsg "missing 'table.sh' from pipeline directory $scriptsDIR"
+    [ ! -x $scriptsDIR/comb_tables.sh ] && errMsg "missing 'comb_tables.sh' from pipeline directory $scriptsDIR"
+}
+
+#
+#######################################################
+# setting preamble, detecting the scripts' location
+scriptsDIR=`dirname $0`
+
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/auxs/auxFns.sh ]]; then
+    . $scriptsDIR/auxs/auxFns.sh --source-only;
+else
+    echo "Error auxiliary file: " $scriptsDIR/auxs/auxFns.sh "NOT found!"
+    exit
+fi
+#######################################################
+
+#### CHECKS ###########################################
+welcome
+
+### CHECK arguments
+if [[ $# -eq 0 ]]; then
+    errMsg "No arguments supplied!";
+fi
+#
+if [[ $# -lt 5 ]]; then
+    errMsg "Five mandatory arguments are needed!";
+fi
+
+### CHECK software
+checkIntegrityPipeline table.sh comb_tables.sh
+checkTools bwa samtools
+######################################################
+
+# ====================================================
+# DEFINE SOME env.VARIABLES
+#dataDIR=DATA
+#dataDIR=data2
+#[ -d $1 ] && dataDIR=$1 || errMsg "1st argument should be a directory, where the reference data to be used is located"
+
+## reference file
+#fastaFILE=$dataDIR/T_thermophila_June2014_assembly.fasta    # <==== this could be another argument!!!
+#
+#faiFILE=$fastaFILE.fai
+
+## target files
+## TO BE READ as arguments... $1 (INPUT) and $2 (IP)
+[ -s $1 ] && INPUTfile=`basename $1` || errMsg "1st argument should be a file with the INPUT reads"
+[ -s $2 ] && IPfile=`basename $2` || errMsg "2nd argument should be a file with the IP reads"
+[ -s $3 ] && FASTAfile=`basename $3` || errMsg "3rd argument should be a file with the reference genome (fasta)"
+[ -s $4 ] && REFfile=`basename $4` || errMsg "4th argument should be an annotation file (gff3)"
+
+faiFILE=$FASTAfile
+fastaFILE=$FASTAfile
+
+inputFILE1=$INPUTfile
+inputFILE2=$IPfile
+
+# could be RAMdisk or TEMP on an SSD
+# use RAMDISK
+#RAMdisk=/dev/shm/$USER/
+#RAMdisk=/tmp/$USER/
+RunTimeDir=ORF_RACS-$$-`date '+%Y%m%d-%H%M%S'`
+[ -d $5 ] && WORKINGdir=$5/$USER/$RunTimeDir || errMsg "5th argument must be a directory, we suggest using /dev/shm for RAMdisk or /tmp in an SSD device, as this pipeline is quite I/O intense!"
+echo "Working directory --> " $WORKINGdir
+
+
+# number of threads to use with BWA
+cores=$(grep -P '^core id\t' /proc/cpuinfo | wc -l)
+#[ -z $6 ] && NT=$6 || NT=$cores
+if [ -n "$6" ]; then NT=$6 ; else NT=$cores; fi
+echo "BWA will use nbrCores="$NT", when possible"
+
+## =====
+
+#inputFILE1=$INPUTfile    #.fastq.gz
+#[ -s $inputFILE1 ] || errMsg $inputFILE1" not found"
+#inputFILE2=$IPfile    #.fastq.gz
+#[ -s $inputFILE2 ] || errMsg $inputFILE2" not found"
+
+# save the original directory
+myDIR=$(pwd)
+resultsDIR=$myDIR/ORF_RACS_results-`date '+%Y%m%d-%H%M%S'`    #`date +%D-%T`
+scriptsDIR=`dirname $0`
+
+
+# use RAMDISK
+#RAMdisk=/dev/shm/$USER/
+#RAMdisk=/tmp/$USER/
+
+echo    # create local space on RAMDISK
+mkdir -p $WORKINGdir
+# copy data into WORKINGdir
+#cp -pr ./DATA $RAMdisk
+cp -prv $1 $WORKINGdir/$inputFILE1
+cp -prv $2 $WORKINGdir/$inputFILE2
+cp -prv $3 $WORKINGdir/$FASTAfile
+cp -prv $4 $WORKINGdir/$REFfile
+# move to WORKINGdir
+cd $WORKINGdir
+
+
+# outputs
+outputINPUTbam=aln$INPUTfile.bam
+outputINPUTbamSORTED=aln$INPUTfile-sorted.bam
+outputINPUTsam=aln$INPUTfile.sam
+#outputIPbam=alnBD1_IP.bam
+#outputIPbamSORTED=alnBD1_IP-sorted.bam
+#outputIPsam=alnBD1_IP.sam
+outputIPbam=aln$IPfile.bam
+outputIPbamSORTED=aln$IPfile-sorted.bam
+outputIPsam=aln$IPfile.sam
+#output3gzip=/dev/shm/$USER/alnBD1_INPUT.tar.gz
+
+# tables files
+tableINPUTs=tableReadsINPUT.`basename $INPUTfile .fastq.gz`
+tableIPs=tableReadsIP.`basename $IPfile .fastq.gz`
+
+####=====================================================
+
+
+## pipeline
+# index the assembly
+# generates 4 aux. files (in principle these could be removed! -- *.fasta.*)
+$BWA index $fastaFILE
+#rm *.fasta.*
+
+# align the INPUT to the assembly
+# bwa -> multithreaded... (up to 4)
+$BWA aln -t $NT $fastaFILE $inputFILE1 > $outputINPUTbam
+
+# convert to sequence coordinates for the INPUT
+$BWA samse $fastaFILE $outputINPUTbam $inputFILE1 > $outputINPUTsam
+
+# remove inputFILE1 in case of memory issues
+#rm -v $inputFILE1
+
+#######
+# zip file in RAMDISK
+#tar cvzf $output3gzip $output2sam
+######
+
+# align the IP file to the assembly
+# bwa -> multithreaded... (up to 4)
+$BWA aln -t $NT $fastaFILE $inputFILE2 > $outputIPbam
+
+# convert to sequence coordinates for the IP
+$BWA samse $fastaFILE $outputIPbam $inputFILE2 > $outputIPsam
+
+# remove inputFILE2 in case of oom
+#rm -v $inputFILE2
+
+
+# FAI index of the assembly
+$SAMTOOLS faidx $fastaFILE
+
+
+# align the assembly with the sam and bam files
+$SAMTOOLS import $faiFILE $outputINPUTsam $outputINPUTbam
+$SAMTOOLS import $faiFILE $outputIPsam $outputIPbam
+
+# sorting for IGV
+$SAMTOOLS sort $outputINPUTbam -o $outputINPUTbamSORTED
+$SAMTOOLS sort $outputIPbam -o $outputIPbamSORTED
+
+$SAMTOOLS index $outputINPUTbamSORTED
+$SAMTOOLS index $outputIPbamSORTED
+
+
+# define the table file for extracting genes... passed as third argument to the script!!!
+table=table.$REFfile
+# extract the reads
+touch $tableINPUTs    #tableReadsINPUT
+# read scaffolds from the table generated by the "table" script and loop over the sorted BAM files
+# NEEDS the file "table" generated by the script 'table.sh' // if it doesn't exist, launch its creation in parallel
+### [ ! -e $resultsDIR/$table ] || $scriptsDIR/table.sh $REFfile && cp -v $resultsDIR/$table .
+$scriptsDIR/table.sh $REFfile
+[ $? -ne 0 ] && errMsg "'$scriptsDIR/table.sh $REFfile' FAILED...! exitcode $?"
+
+for i in `awk '{print $1}' < $table `; do
+    #echo $i;
+    $SAMTOOLS view $outputINPUTbamSORTED $i | wc -l >> $tableINPUTs    #tableReadsINPUT
+done
+
+touch $tableIPs    #tableReadsIP
+for i in `awk '{print $1}' < $table `; do
+    #echo $i;
+    $SAMTOOLS view $outputIPbamSORTED $i | wc -l >> $tableIPs    #tableReadsIP
+done
+
+
+# remove temporary files...
+rm -v *.fasta.*
+
+### copy data into the original directory
+mkdir -p $resultsDIR
+cp $outputINPUTbamSORTED $outputIPbamSORTED $resultsDIR
+cp $outputINPUTsam $outputIPsam $resultsDIR
+cp $tableINPUTs $resultsDIR
+cp $tableIPs $resultsDIR
+# the index files are needed for the intergenic regions determination
+cp *bai $resultsDIR
+
+# FINAL step, combine the tables together...
+#. $scriptsDIR/comb_tables.sh _2_MED2_INPUT_S26_L007_R1_001 _4_MED2_IP_S28_L007_R1_001 T_thermophila_June2014.gff3
+$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile
+[ $? -ne 0 ] && errMsg "'$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile' FAILED...! exitcode $?"
+cp -v FINAL.table.* $resultsDIR
+
+# clean the temp dir used
+rm -rfv $WORKINGdir
diff --git a/core/countReads.sh b/core/countReads.sh
new file mode 100755
index 0000000..8a9b60d
--- /dev/null
+++ b/core/countReads.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+# -vx
+
+# RACS ORF pipeline
+# main script used to analyze and call (count) peaks in the T. thermophila data.
+# This script also calls 2 other scripts: "table.sh" and "comb_tables.sh"
+# The script requires BWA and samtools to be installed, as they are used within the pipeline.
+
+#################
+### how to use it
+#
+# 1st argument: file with INPUT reads (.fastq.gz)
+# 2nd argument: file with IP reads (.fastq.gz)
+# 3rd argument: reference genome file (.fasta)
+# 4th argument: annotation file (.gff3)
+# 5th argument: working space (if possible use RAMdisk --ie. /dev/shm/--, or /tmp in a computer with SSD)
+# 6th argument (optional): number of cores to use for BWA multi-threading
+#
+# Examples:
+#   time PATHtoRACSrepo/core/countReads.sh _1_MED1_INPUT_S25_L007_R1_001.fastq.gz _3_MED1_IP_S27_L007_R1_001.fastq.gz T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /tmp/ 16
+#   time PATHtoRACSrepo/core/countReads.sh _1_MED1_INPUT_S25_L007_R1_001.fastq.gz _3_MED1_IP_S27_L007_R1_001.fastq.gz T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /dev/shm/ 16
+#
+#################################
+
+
+#######################################################
+# check that the script is not being sourced!!!
+if [[ $0 == '-bash' ]]; then
+    echo "Please do not 'source' this script; ie. run it as PATHtoRACS/core/SCRIPTname arguments"
+    return 1
+fi
+
+# setting preamble, detecting the scripts' location
+scriptsDIR=`dirname $0`
+
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/auxs/auxFns.sh ]]; then
+    . $scriptsDIR/auxs/auxFns.sh --source-only;
+else
+    echo "Error auxiliary file: " $scriptsDIR/auxs/auxFns.sh "NOT found!"
+    exit
+fi
+#######################################################
+
+#### CHECKS ###########################################
+welcome
+
+### CHECK arguments
+if [[ $# -eq 0 ]]; then
+    errMsg "No arguments supplied!";
+fi
+#
+if [[ $# -lt 5 ]]; then
+    errMsg "Five mandatory arguments are needed!";
+fi
+
+### CHECK software
+checkIntegrityPipeline table.sh comb_tables.sh
+checkTools bwa samtools
+######################################################
+# define generic names for the required programs...
+BWA=$(which bwa)
+SAMTOOLS=$(which samtools)
+
+# ====================================================
+# DEFINE SOME env.VARIABLES
+#dataDIR=DATA
+#dataDIR=data2
+#[ -d $1 ] && dataDIR=$1 || errMsg "1st argument should be a directory, where the reference data to be used is located"
+
+## reference file
+#fastaFILE=$dataDIR/T_thermophila_June2014_assembly.fasta    # <==== this could be another argument!!!
+#
+#faiFILE=$fastaFILE.fai
+
+## target files
+## TO BE READ as arguments... $1 (INPUT) and $2 (IP)
+[ -s $1 ] && INPUTfile=`basename $1` || errMsg "1st argument should be a file with the INPUT reads (fastq.gz)"
+[ -s $2 ] && IPfile=`basename $2` || errMsg "2nd argument should be a file with the IP reads (fastq.gz)"
+[ -s $3 ] && FASTAfile=`basename $3` || errMsg "3rd argument should be a file with the reference genome (fasta)"
+[ -s $4 ] && REFfile=`basename $4` || errMsg "4th argument should be an annotation file (gff3)"
+
+faiFILE=$FASTAfile
+fastaFILE=$FASTAfile
+
+inputFILE1=$INPUTfile
+inputFILE2=$IPfile
+
+# could be RAMdisk or TEMP on an SSD
+# use RAMDISK
+#RAMdisk=/dev/shm/$USER/
+#RAMdisk=/tmp/$USER/
+RunTimeDir=ORF_RACS-$$-`date '+%Y%m%d-%H%M%S'`
+[ -d $5 ] && WORKINGdir=$5/$USER/$RunTimeDir || errMsg "5th argument must be a directory, we suggest using /dev/shm for RAMdisk or /tmp in an SSD device, as this pipeline is quite I/O intense!"
+echo "Working directory --> " $WORKINGdir + + +# number of threads to use with BWA +#cores=$(grep -P '^core id\t' /proc/cpuinfo| wc -l) +cores=`detectCores` +#[ -z $6 ] && NT=$6 || NT=$cores +if [ " "$6 != " " ]; then NT=$6 ; else NT=$cores; fi +echo "BWA will use nbrCores="$NT", when possible" + +## ===== + +#inputFILE1=$INPUTfile #.fastq.gz +#[ -s $inputFILE1 ] || errMsg $inputFILE1" not found" +#inputFILE2=$IPfile #.fastq.gz +#[ -s $inputFILE2 ] || errMsg $inputFILE2" not found" + +# save original directory +myDIR=$(pwd) +resultsDIR=$myDIR/ORF_RACS_results-`date '+%Y%m%d-%H%M%S'` #`date +%D-%T` +scriptsDIR=`dirname $0` + + +# use RAMDISK +#RAMdisk=/dev/shm/$USER/ +#RAMdisk=/tmp/$USER/ + +echo # create local space on RAMDISK +mkdir -p $WORKINGdir +# copy data into WORKINGdir +#cp -pr ./DATA $RAMdisk +cp -prv $1 $WORKINGdir/$inputFILE1 +cp -prv $2 $WORKINGdir/$inputFILE2 +cp -prv $3 $WORKINGdir/$FASTAfile +cp -prv $4 $WORKINGdir/$REFfile +# move to WORKINGdir +cd $WORKINGdir + + +# outputs +outputINPUTbam=aln$INPUTfile.bam +outputINPUTbamSORTED=aln$INPUTfile-sorted.bam +outputINPUTsam=aln$INPUTfile.sam +#outputIPbam=alnBD1_IP.bam +#outputIPbamSORTED=alnBD1_IP-sorted.bam +#outputIPsam=alnBD1_IP.sam +outputIPbam=aln$IPfile.bam +outputIPbamSORTED=aln$IPfile-sorted.bam +outputIPsam=aln$IPfile.sam +#output3gzip=/dev/shm/$USER/alnBD1_INPUT.tar.gz + +# tables files +tableINPUTs=tableReadsINPUT.`basename $INPUTfile .fastq.gz` +tableIPs=tableReadsIP.`basename $IPfile .fastq.gz` + +####===================================================== + + +## pipeline +# index the assembly +# generates 4 aux. files (in principle could be removed! -- *.fasta.*) +$BWA index $fastaFILE +#rm *.fasta.* + +# align the INPUT to assembly +# bwa -> multithreaded... (upto 4) +$BWA aln -t $NT $fastaFILE $inputFILE1 > $outputINPUTbam + +# convert to sequence coordinate for INPUT +$BWA samse $fastaFILE $outputINPUTbam $inputFILE1 > $outputINPUTsam + +# remove inputFILE1 in case of memory issues +#rm -v $inputFILE1 + +####### +# zip file in RAMDISK +#tar cvzf $output3gzip $output2sam +###### + +# align the IP file to the assembly +# bwa -> multithreaded... (upto 4) +$BWA aln -t $NT $fastaFILE $inputFILE2 > $outputIPbam + +# convert to sequence coordinate for IP +$BWA samse $fastaFILE $outputIPbam $inputFILE2 > $outputIPsam + +# remove inputFILE2 in case of oom +#rm -v $inputFILE2 + + +# FAI index to the assembly +$SAMTOOLS faidx $fastaFILE + + +# align assembly witht the sam and bam files +$SAMTOOLS import $faiFILE $outputINPUTsam $outputINPUTbam +$SAMTOOLS import $faiFILE $outputIPsam $outputIPbam + +# sorting for IGV +$SAMTOOLS sort $outputINPUTbam -o $outputINPUTbamSORTED +$SAMTOOLS sort $outputIPbam -o $outputIPbamSORTED + +$SAMTOOLS index $outputINPUTbamSORTED +$SAMTOOLS index $outputIPbamSORTED + + +# define table file for extracting genes... passed as third argument to the script!!! +table=table.$REFfile +# extract the reads +touch $tableINPUTs #tableReadsINPUT +# read scaffolds from table generated by the "table" script and loop over the sorted BAM files +# NEEDS the file "table" generated by the script 'table.sh' // if it doesn't exist, launch its creation in parallel +### [ ! -e $resultsDIR/$table ] || $scriptsDIR/table.sh $REFfile && cp -v $resultsDIR/$table . +$scriptsDIR/table.sh $REFfile +[ $? -ne 0 ] && errMsg "' $scriptsDIR/table.sh $REFfile' FAILED...! exitcode $?" 
+
+for i in `awk '{print $1}' < $table `; do
+    #echo $i;
+    $SAMTOOLS view $outputINPUTbamSORTED $i | wc -l >> $tableINPUTs    #tableReadsINPUT
+done
+
+touch $tableIPs    #tableReadsIP
+for i in `awk '{print $1}' < $table `; do
+    #echo $i;
+    $SAMTOOLS view $outputIPbamSORTED $i | wc -l >> $tableIPs    #tableReadsIP
+done
+
+
+# remove temporary files...
+rm -v *.fasta.*
+
+### copy data into the original directory
+mkdir -p $resultsDIR
+cp $outputINPUTbamSORTED $outputIPbamSORTED $resultsDIR
+cp $outputINPUTsam $outputIPsam $resultsDIR
+cp $tableINPUTs $resultsDIR
+cp $tableIPs $resultsDIR
+# the index files are needed for the intergenic regions determination
+cp *bai $resultsDIR
+
+# FINAL step, combine the tables together...
+#. $scriptsDIR/comb_tables.sh _2_MED2_INPUT_S26_L007_R1_001 _4_MED2_IP_S28_L007_R1_001 T_thermophila_June2014.gff3
+$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile
+[ $? -ne 0 ] && errMsg "'$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile' FAILED...! exitcode $?"
+cp -v FINAL.table.* $resultsDIR
+
+# clean the temp dir used
+rm -rfv $WORKINGdir
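+
+# End-to-end sketch (for orientation only; file names are placeholders):
+# the ORF stage above is typically followed by the IGR stage, e.g.
+#    PATHtoRACSrepo/core/countReads.sh INPUT.fastq.gz IP.fastq.gz assembly.fasta annotation.gff3 /tmp/
+#    cd ORF_RACS_results-YYYYMMDD-HHMMSS
+#    ls aln*fastq.gz-sorted.bam > samples.file
+#    PATHtoRACSrepo/core/intergenic/det-interGenes.sh FINAL.table.INPUT-IP annotation.sorted.gff3 interGENs.csv samples.file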
diff --git a/core/intergenic/det-interGenes.sh b/core/intergenic/det-interGenes.sh
new file mode 100755
index 0000000..1703f59
--- /dev/null
+++ b/core/intergenic/det-interGenes.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+# det-interGenes.sh --- RACS IGR pipeline
+# main script in charge of determining the intergenic regions
+# this part of the pipeline requires the table generated by the ORF reads/counts
+#
+#
+# HOW TO USE THIS SCRIPT:
+#   arg1: final combined table generated by the ORF part of the RACS pipeline
+#   arg2: reference genome file (gff3)
+#   arg3: name of the file where to save the InterGenic Regions
+#   arg4: text file containing the names of the sorted BAM output files from the ORF part of the pipeline; eg.
+#      alnGreenblatt_1_SSCH1-1_ChIP_S81_R1_001.fastq.gz-sorted.bam
+#      alnGreenblatt_1_SSCH1-1_ChInput_S82_R1_001.fastq.gz-sorted.bam
+#
+# The aln*.fastq.gz-sorted.bam files should have been generated with the ORF part of the RACS pipeline!
+#
+# eg.
+#   PATHtoRACSrepo/core/intergenic/det-interGenes.sh combinedTABLES_MED1-MED2 dataset/T_thermophila_June2014.sorted.gff3 interGENs_MED1-MED2.csv samples.file
+#
+
+#################################################
+
+# Setting preamble, detecting the scripts' location
+scriptsDIR=`dirname $0`
+
+#################################################
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/../auxs/auxFns.sh ]]; then
+    . $scriptsDIR/../auxs/auxFns.sh --source-only;
+else
+    echo "Error auxiliary file: " $scriptsDIR/../auxs/auxFns.sh "NOT found!"
+    exit
+fi
+
+# display RACS welcome/credit message
+welcome
+
+
+#### CHECKS #####################################
+### INTEGRITY CHECKs
+# check the RACS scripts for the IGR routines
+checkIntegrityPipeline det-interGenes.sh \
+    interGeneRegions.R utils_RACS-IGR.R \
+    interGenes.sh
+
+# check the external tools needed
+checkTools samtools Rscript
+
+################################################
+
+### CHECK arguments
+if [[ $# -eq 0 ]]; then
+    errMsg "No arguments were supplied!";
+fi
+#
+case $# in
+    4) echo $# "Arguments received:"; echo $@;;
+    *) usage ;;
+esac
+
+###########
+
+# getting the command line arguments
+# MANDATORY arguments
+combTABLE=$1         #combinedTABLES_MED1-MED2
+refFILE=$2           #dataset/T_thermophila_June2014.sorted.gff3
+interGENregions=$3   #interGENs_MED1-MED2.csv -- output from the Rscript
+SAMPLES=$4   # input.samples: text file containing the names of the sample files to process, eg.
+             #    alnGreenblatt_1_SSCH1-1_ChIP_S81_R1_001.fastq.gz-sorted.bam
+             #
+             #    alnFillingham_1_MED1_INPUT_S25_L007_R1_001
+             #    alnFillingham_2_MED2_INPUT_S26_L007_R1_001
+             #    alnFillingham_3_MED1_IP_S27_L007_R1_001
+             #    alnFillingham_4_MED2_IP_S28_L007_R1_001
+#
+
+# check the input arguments, ie. that the files indicated actually exist!
+checkFile $combTABLE $refFILE $SAMPLES
+
+
+combinedFinalIGenicTable=${interGENregions}_INPUTs-IPs`date '+%Y%m%d-%H%M%S'`.csv
+# ================================================
+
+# step #0: look into combTABLE and generate the list of intergenic regions
+t0=$(time Rscript $scriptsDIR/interGeneRegions.R $combTABLE $refFILE $interGENregions)
+
+Ttot=$((t0))
+files=$(cat $SAMPLES)
+echo "TARGETS: " $files
+for file in $files; do
+    echo 'processing sample file:' $file
+    # process each of the samples against the intergenic regions just generated in 'step #0'
+    t1=$(time $scriptsDIR/interGenes.sh $file-`date '+%Y%m%d-%H%M%S'` $file $interGENregions)
+    # Add the individual times...
+    #Ttot=$((Ttot+t1))
+done
+
+
+# combine the tables with the INPUT/IP reads
+#t5=$(time paste interGENs_MED1-MED2.csv interGENs-Fillingham_1_MED1_INPUT_S25_L007_R1_001 interGENs-Fillingham_3_MED1_IP_S27_L007_R1_001 interGENs-Fillingham_2_MED2_INPUT_S26_L007_R1_001 interGENs-Fillingham_4_MED2_IP_S28_L007_R1_001 > interGENs_MED1-MED2_INPUTs-IPs.csv )
+###>>>>>>
+t5=$(time paste $interGENregions `cat $SAMPLES | awk '{print "interGENs-"$1".csv"}'` > $combinedFinalIGenicTable)
+
+# statistics on the running times
+echo Total time: $((Ttot+t5))
+
+###########################################################################
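+
+# Expected outputs (a sketch; sample names below are placeholders): for a samples.file
+# listing two sorted BAM files, the loop above produces one 'interGENs-<sample>.csv'
+# per sample, and the final 'paste' combines them with the IGR table, e.g.:
+#    interGENs-alnSAMPLE_Input.fastq.gz-sorted.bam.csv
+#    interGENs-alnSAMPLE_ChIP.fastq.gz-sorted.bam.csv
+#    interGENs_MED1-MED2.csv_INPUTs-IPsYYYYMMDD-HHMMSS.csv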
diff --git a/core/intergenic/interGeneRegions.R b/core/intergenic/interGeneRegions.R
new file mode 100755
index 0000000..95cdff0
--- /dev/null
+++ b/core/intergenic/interGeneRegions.R
@@ -0,0 +1,176 @@
+#########################################################################
+# RACS: InterGenic Regions -- main driver R script
+#
+# requires the "utils_RACS-IGR.R" utilities file with the fn defns
+#
+# part of the det-interGenes.sh pipeline
+# used for determining the intergenic regions
+#
+#
+# HOW TO USE this script:
+#
+#   Rscript interGeneRegions.R inputFile refFILE.gff3 outFile
+#
+#
+########################################################################
+
+#
+# Function to establish the location of the script... hence the relative path to the utilities file
+# this function needs to be in the main driver script!
+#
+
+ALLargs <- function() {
+    cmdArgs <- commandArgs(trailingOnly = FALSE)
+    scrPATHkw <- "--file="
+    match <- grep(scrPATHkw, cmdArgs)
+    if (length(match) > 0) {
+        # Rscript
+        scriptLOC <- dirname(normalizePath(sub(scrPATHkw, "", cmdArgs[match])))
+    } else {
+        # 'source'd via R console
+        scriptLOC <- normalizePath(sys.frames()[[1]]$ofile)
+    }
+
+    cmdArgs <- commandArgs(trailingOnly = TRUE)
+    return(list(scriptLOC, cmdArgs))
+}
+
+
+
+allArgs <- ALLargs()
+
+print(allArgs)
+
+########################################################################
+########################################################################
+cat(" RACS v1.0 (2018/2019) -- InterGenic Regions Finder",'\n')
+cat("----------------------------------------------------",'\n')
+#######################################################################
+########################################################################
+# load the utilities file with the fn defns.
+
+utilitiesFile <- paste(allArgs[[1]],"/utils_RACS-IGR.R",sep='')
+print(utilitiesFile)
+#print(dirname(sys.frame(1)$ofile)
+#sourceDir <- getSrcDirectory(function(dummy) {dummy})
+#print(sourceDir)
+
+
+if (file.exists(utilitiesFile)) {
+    source(utilitiesFile)
+} else {
+    stop("Critical ERROR: '",utilitiesFile,"' NOT found!")
+}
+
+########################################################################
+
+################################
+
+
+# read the files...
+files2process <- CLArgs()
+
+inputDATA <- readDATA(files2process[1])
+
+
+refTable <- readRefTable(files2process[2])
+
+
+sortedDATA <- reshapeTable(inputDATA)
+
+nbrEntries <- length(sortedDATA$lscaffold)
+
+
+
+#for (i in c(1:nbrEntries)) {
+#    #a <- sortedDATA[i,][1]    #levels(sortedDATA[i,]$lscaffold)[i]
+#    a <- levels(sortedDATA$lscaffold)[sortedDATA$lscaffold][i]
+#    aa <- sortedDATA[i,]$lbregion
+#    aaa <- sortedDATA[i,]$leregion
+#    cat(a,aa,aaa,'\n')
+#}
+
+lstscfld <- c()
+lstregion1 <- c()
+lstregion2 <- c()
+lstSize <- c()
+#nbrEntries <- 20
+
+SCFLD <- ""
+for (i in c(1:nbrEntries)) {
+    scaffold <- levels(sortedDATA$lscaffold)[sortedDATA$lscaffold][i]
+    begReg <- sortedDATA[i,]$lbregion
+    endReg <- sortedDATA[i,]$leregion
+    cat(i, scaffold,begReg,endReg,'\n')
+    if (SCFLD == scaffold) { # same scaffold
+        #if ((lstscfld[length(lstscfld)]==scaffold)&(begReg<lstregion2[length(lstscfld)])) {
+        #if (begReg>lstregion1[try(length(lstscfld))]) {
+        #if (begReg>endReg) {
+        if (begReg > sortedDATA[i-1,]$leregion) {
+            beginRegion <- begReg
+            #dumpData(SCFLD,endRegion,beginRegion, lstscfld,lstregion1,lstregion2,lstSize, 1)
+            myScfld <- dumpData(SCFLD,endRegion,beginRegion, 1)
+            lstscfld <- c(lstscfld, myScfld[1])
+            lstSize <- c(lstSize, myScfld[4])
+            lstregion1 <- c(lstregion1,endRegion+1)
+            lstregion2 <- c(lstregion2,beginRegion-1)
+            endRegion <- endReg
+        } else { # we will skip this region...
+            print(">>>>>>>>>>>>>>>> SUSPICIOUS OVERLAPPING REGIONs!!! <<<<<<<<<<<<<<<<")
+            cat(scaffold,begReg,endReg,'\n')
+            cat(try(lstscfld[length(lstscfld)-1]),try(lstregion1[length(lstscfld)-1]),try(lstregion2[length(lstscfld)-1]),'\n')
+            #stop
+        }
+    } else { # change of scaffold
+        if (SCFLD != "") {
+            beginRegion <- begReg
+            #dumpData(SCFLD,endRegion,'xxx', lstscfld,lstregion1,lstregion2,lstSize, 1)
+            scfCAP <- scfldCAP(SCFLD,refTable)
+            if (endRegion < scfCAP) { # dealing with cases where the gene ends at the limit of the scaffold...
+                myScfld <- dumpData(SCFLD,endRegion,scfCAP+1, 1)
+                lstscfld <- c(lstscfld, myScfld[1])
+                lstSize <- c(lstSize, myScfld[4])
+                lstregion1 <- c(lstregion1,endRegion+1)
+                lstregion2 <- c(lstregion2,scfCAP)
+            } else {
+                print(">>>>>>>>>>>>>>>> SUSPICIOUS OVERLAPPING REGIONs!!! <<<<<<<<<<<<<<<<")
+                cat(scaffold,begReg,endReg,'\n')
+                cat(try(lstscfld[length(lstscfld)-1]),try(lstregion1[length(lstscfld)-1]),try(lstregion2[length(lstscfld)-1]),'\n')
+                #stop
+            }
+            #dumpData(scaffold,'0',beginRegion, lstscfld,lstregion1,lstregion2,lstSize, 1)
+            myScfld <- dumpData(scaffold,'0',beginRegion, 1)
+            lstscfld <- c(lstscfld, myScfld[1])
+            lstSize <- c(lstSize, myScfld[4])
+            lstregion1 <- c(lstregion1,0+1)
+            lstregion2 <- c(lstregion2,beginRegion-1)
+            endRegion <- endReg
+        } else { # first scaffold ever...
+            beginRegion <- begReg    #strsplit(region,'-')[[1]][1]
+            endRegion <- endReg      #strsplit(region,'-')[[1]][2]
+            #dumpData(scaffold,'0',beginRegion, lstscfld,lstregion1,lstregion2,lstSize, 1)
+            myScfld <- dumpData(scaffold,'0',beginRegion, 1)
+            lstscfld <- c(lstscfld, myScfld[1])
+            lstSize <- c(lstSize, myScfld[4])
+            lstregion1 <- c(lstregion1,0+1)
+            lstregion2 <- c(lstregion2,beginRegion-1)
+        }
+        SCFLD <- scaffold
+    }
+}
+# take care of the last case...
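+# (after the loop no further gene triggers the "change of scaffold" branch,
+# so the stretch between the last gene's end and the scaffold cap -- taken
+# from the gff3 'supercontig' entry -- still has to be emitted here)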
+#dumpData(scaffold,endRegion,'xxx', lstscfld,lstregion1,lstregion2,lstSize, 1)
+scfCAP <- scfldCAP(scaffold,refTable)
+myScfld <- dumpData(scaffold,endRegion,scfCAP+1, 1)
+lstscfld <- c(lstscfld, myScfld[1])
+lstSize <- c(lstSize, myScfld[4])
+lstregion1 <- c(lstregion1,endRegion+1)
+lstregion2 <- c(lstregion2,scfCAP)
+
+interGenes <- data.frame(lstscfld,lstregion1,lstregion2,lstSize)
+names(interGenes) <- c("scaffold",'beginning','end','size')
+saveDATA(interGenes,files2process[3])
+
+
+
+
diff --git a/core/intergenic/interGenes.sh b/core/intergenic/interGenes.sh
new file mode 100755
index 0000000..9edc799
--- /dev/null
+++ b/core/intergenic/interGenes.sh
@@ -0,0 +1,73 @@
+# interGenes.sh --- RACS pipeline
+#
+# shell script part of the RACS InterGenic Regions determination
+# this script is part of the IGR RACS tool, use "det-interGenes.sh" instead!
+# this script should receive 3 arguments
+#
+# arg1: a label to annotate the produced files
+# arg2: alnXXXX.fastq.gz-sorted.bam file produced when running the ORF RACS pipeline tool
+# arg3: table containing the IGR
+#
+# Eg. call the script with BDx_{INPUT/IP}_NSG interGENs.csv
+## . ../newSCRIPTS/interGenes.sh INPUTm1 alnFillingham_1_MED1_INPUT_S26_L007_R1_001.fastq.gz-sorted.bam interGENs_MED1-MED2.csv
+#
+##################################################
+# setting preamble, detecting scripts location
+scriptsDIR=`dirname $0`
+
+
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/../auxs/auxFns.sh ]]; then
+    . $scriptsDIR/../auxs/auxFns.sh --source-only;
+else
+    echo "Error: auxiliary file" $scriptsDIR/../auxs/auxFns.sh "NOT found!"
+    exit
+fi
+#################################################
+
+# display RACS welcome message
+welcome
+
+#### CHECKS
+### INTEGRITY CHECKs
+# check external tools needed
+checkTools samtools
+
+#################################################
+
+### CHECK arguments
+case $# in
+    3) echo $# "Arguments received:"; echo $@;;
+    *) errMsg "Invalid number of arguments!"; usage ;;
+esac
+
+#########
+
+HEADER=$1
+src=$2
+#bamSORTEDfiles=aln$src-sorted.bam
+bamSORTEDfiles=$2
+interGENregions=$3
+table_interGENs=interGENs-$src.csv    #--`date '+%Y%m%d-%H%M%S'`.csv
+
+# check arguments: verify that the input files exist...
+checkFile $interGENregions $bamSORTEDfiles
+
+# check whether there was a previous version of the IGR table, and if so rename it
+# [[ -s $table_interGENs ]] && mv -v $table_interGENs $table_interGENs-PREVIOUS_`date '+%Y%m%d-%H%M%S'`.csv
+
+# generate header
+echo $table_interGENs"::"$HEADER > $table_interGENs
+
+
+# count reads...
+for i in `awk '{ if (NR>1) print $1}' < $interGENregions `; do
+    #echo $i;
+    samtools view $bamSORTEDfiles $i | wc -l >> $table_interGENs
+done
+
+####################################################################
diff --git a/core/intergenic/utils_RACS-IGR.R b/core/intergenic/utils_RACS-IGR.R
new file mode 100755
index 0000000..bbaf91b
--- /dev/null
+++ b/core/intergenic/utils_RACS-IGR.R
@@ -0,0 +1,225 @@
+# RACS pipeline -- interGENIC regions
+# utilities file for the interGeneRegions.R script
+
+
+#########################################################################
+
+reshapeTable <- function(origDATA){
+
+    lscaffold <- c()
+    lbregion <- c()
+    leregion <- c()
+
+    # reshape the table to FIX order...
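+    # each geneSCFFLD entry encodes "scaffold:begin-end" (as assembled by the
+    # ORF part of the pipeline); eg. for a hypothetical entry "scf_X:1004-2314",
+    # strsplit on ':' yields "scf_X" and "1004-2314", and the range is then
+    # split again on '-' into its numeric begin/end coordinates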
+    for (i in origDATA$geneSCFFLD) {
+        lscaffold <- c(lscaffold, strsplit(i,':')[[1]][1])
+        region <- strsplit(i,':')[[1]][2]
+        lbregion <- c(lbregion, as.numeric(strsplit(region,'-')[[1]][1]))
+        leregion <- c(leregion, as.numeric(strsplit(region,'-')[[1]][2]))
+    }
+    tmpTable <- data.frame(lscaffold,lbregion,leregion)
+    sortedTABLE <- tmpTable[order(tmpTable$lscaffold,tmpTable$lbregion),]
+
+    return(sortedTABLE)
+}
+#######################################################################
+
+
+#########################################################################
+
+dumpData <- function(scaffold,region1,region2, flag) {
+    r1 <- as.numeric(region1)+1
+    if (region2 != 'xxx') {
+        r2 <- as.numeric(region2)-1
+        geneSz <- r2-r1+1
+    } else {
+        r2 <- region2
+        geneSz <- 'XXX'
+    }
+
+    strng0 <- paste(scaffold,':',r1,'-',r2,sep='')
+    strng1 <- paste(strng0,'\t',geneSz,sep='')
+    #lstscfld <<- c(lstscfld, strng0)
+    #lstreg1 <<- c(lstreg1, r1)
+    #lstreg2 <<- c(lstreg2, r2)
+    #lstsize <<- c(lstsize, geneSz)
+
+    if (flag==1) {
+        cat(strng1)
+        cat('\n')
+    }
+
+    return(c(strng0,r1,r2,geneSz))
+}
+
+########################################################################
+
+
+########################################################################
+########################################################################
+########################################################################
+
+# OLD intergenic regions generating function
+# (kept for reference only; note that dumpData() now takes a 'flag' argument)
+interGenesRegionGen_OLD <- function(srtedTABLE) {
+    SCFLD <- ""
+    for (i in srtedTABLE$lscaffold) {
+        scaffold <- i    #strsplit(i,':')[[1]][1]    #substr(i,1,12)
+        region <- strsplit(i,':')[[1]][2]
+        print(i)
+        if (SCFLD == scaffold) { # same scaffold
+            beginRegion <- strsplit(region,'-')[[1]][1]
+            #print(i)
+            cat(SCFLD,':',as.numeric(endRegion)+1,'-',as.numeric(beginRegion)-1,'\t',as.numeric(beginRegion)-as.numeric(endRegion))
+            cat('\n')
+            endRegion <- strsplit(region,'-')[[1]][2]
+        } else { # change of scaffold
+            if (SCFLD != '') {
+                #print(i)
+                beginRegion <- strsplit(region,'-')[[1]][1]
+                dumpData(SCFLD,endRegion,'xxx')
+                dumpData(scaffold,'0',beginRegion)
+                endRegion <- strsplit(region,'-')[[1]][2]
+            } else { # first scaffold ever...
+                beginRegion <- strsplit(region,'-')[[1]][1]
+                endRegion <- strsplit(region,'-')[[1]][2]
+                dumpData(scaffold,'0',beginRegion)
+            }
+            SCFLD <- scaffold
+        }
+    }
+    # take care of the last case...
+    dumpData(scaffold,endRegion,'xxx')
+}
+
+########################################################################
+########################################################################
+########################################################################
+
+
+########################################################################
+
+
+
+
+
+########################################################################
+
+# function to dump the newly generated data into a CSV file
+saveDATA <- function(data,fileName) {
+    write.table(data, file=fileName, sep='\t', row.names=FALSE, quote=FALSE)
+}
+
+########################################################################
+# Help/Description Fn
+errMsgFn <- function(...){
+
+    cat("RACS: intergenic region determination Rscript",'\n')
+    cat("----------------------------------------------------",'\n')
+    cat("Attention! This script requires 3 arguments!",'\n')
+    cat('\t'," i: input file where to read the combined tables from",'\n')
+    cat('\t'," ii: reference *gff3* genome file for the organism, eg. 'T_thermophila_June2014.sorted.gff3'", '\n')
+    cat('\t'," iii: name of the generated intergenic files, eg. 'interGENs.csv'", '\n')
+    cat("---------------------------------------------------",'\n\n')
+    cat(paste(...,sep=''),'\n')
+    stop()
+}
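+# errMsgFn() prints the usage summary above, echoes the caller's message and
+# then aborts via stop(); CLArgs() below relies on it for every argument problem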
+
+# Function to process the command line arguments
+CLArgs <- function(def.ref.file="DATA/T_thermophila_June2014.sorted.gff3",def.out.name='interGENs.csv') {
+
+    checkFile <- function(fileN) {
+        if (file.exists(fileN) == FALSE) {
+            errMsgFn("Error: '",fileN,"' NOT found!")
+        }
+    }
+
+    # read command-line arguments, expecting three filenames:
+    # i) to read, ii) ref gff3 file for the organism, iii) to save results
+
+    args <- commandArgs(trailingOnly=TRUE)
+
+    len.args <- length(args)
+    if (len.args < 1) errMsgFn("Error: this script requires 3 arguments!")
+
+    # process 1st argument
+    inputFile <- args[1]    # eg. "combinedTABLES_BD1-BD2--SORTED"
+    checkFile(inputFile)
+
+    if (len.args == 3) {
+        refFile <- args[2]    # eg. "DATA/T_thermophila_June2014.sorted.gff3"
+        checkFile(refFile)
+        outFile <- args[3]
+    } else if (len.args == 2) {
+        refFile <- args[2]
+        checkFile(refFile)
+        cat("Assuming default name for output file... '",def.out.name,"'",'\n')
+        outFile <- def.out.name
+    } else {
+        errMsgFn("Error: incorrect number of arguments!")
+    }
+
+    return(c(inputFile,refFile,outFile))
+}
+
+
+# Function for reading the input data from a file...
+readDATA <- function(filename) {
+
+    ## read DATA
+    #combinedDATA=read.csv("~/Downloads/combinedTABLES_BD1-BD2--SORTED",header=F,sep='\t')
+    #combinedDATA=read.csv("./TABLEE",header=F,sep='\t')
+    #inputDATA <- read.csv(filename, header=F, sep='\t')
+
+    # the latest version of RACS-ORF will generate the final version of the TABLES with headers
+    inputDATA <- read.csv(filename, header=TRUE, sep='')
+
+    # give names to the columns...
+    names(inputDATA)[1]<-"geneSCFFLD"
+    #names(combinedDATA)[3]<-"Note"
+    #names(combinedDATA)[4]<-"geneSize"
+    #names(inputDATA)[5]<-"BD1readsINPUT"
+    #names(inputDATA)[6]<-"BD1readsIP"
+    #names(combinedDATA)[7]<-"normBD1readsINPUT"
+    #names(combinedDATA)[8]<-"normBD1readsIP"
+    #names(combinedDATA)[9]<-"BD1score"
+    #names(inputDATA)[10]<-"BD2readsINPUT"
+    #names(inputDATA)[11]<-"BD2readsIP"
+    #names(combinedDATA)[12]<-"normBD2readsINPUT"
+    #names(combinedDATA)[13]<-"normBD2readsIP"
+    #names(combinedDATA)[14]<-"BD2score"
+    #names(combinedDATA)[15]<-"BD1BD2enr"
+
+    return(inputDATA)
+}
+#########################################################################
+readRefTable <- function(refFile) {
+
+    refTableOrig <- read.csv(refFile, header=FALSE, sep='\t')
+
+    refTable <- data.frame(refTableOrig[refTableOrig$V3=='supercontig',]$V1, refTableOrig[refTableOrig$V3=='supercontig',]$V5)
+    names(refTable) <- c("scaffold","supercontig")
+
+    return(refTable)
+}
+
+#########################################################################
+
+scfldCAP <- function(scfld,refTable) {
+    return(refTable[refTable$scaffold==scfld,]$supercontig)
+}
+
+#########################################################################
+
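+# eg. for a hypothetical scaffold name "scf_X", scfldCAP("scf_X",refTable)
+# returns its last coordinate, ie. the end (column V5) of the corresponding
+# 'supercontig' record extracted from the reference gff3 by readRefTable()
+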
+warningSCFLD <- function(scfld,begReg,endReg) {
+    print(">>>>>>>>>>>>>>>> SUSPICIOUS OVERLAPPING REGIONs!!! <<<<<<<<<<<<<<<<")
+    cat(scfld,begReg,endReg,'\n')
+    cat(try(lstscfld[length(lstscfld)-1]),try(lstregion1[length(lstscfld)-1]),try(lstregion2[length(lstscfld)-1]),'\n')
+    #stop
+}
+
+########################################################################
+
+
+
+
diff --git a/core/normalizeORF.sh b/core/normalizeORF.sh
new file mode 100755
index 0000000..fe9e622
--- /dev/null
+++ b/core/normalizeORF.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# normalizeORF.sh --- RACS pipeline
+#
+# Script utilized to normalize the reads from the ORF
+#
+#
+###
+### How to use this script:
+#
+# This script takes 3 mandatory arguments, plus an optional 4th one:
+#
+# - 1st argument: "FINAL.table.*" file from RACS' ORF pipeline
+# - 2nd argument: "PF-INPUT-value" PF value corresponding to the INPUT file
+# - 3rd argument: "PF-IP-value" PF value corresponding to the IP file
+# - 4th argument: 'A' or 'D' (OPTIONAL); when this 4th argument is specified, an additional table is created, ordered with respect to the IP/INPUT ratio, in "A"scending or "D"escending order
+#
+#
+# Example:
+# PATHtoRACS/core/normalizeORF.sh FINAL.table.XXXX 14694464 10148171
+# PATHtoRACS/core/normalizeORF.sh FINAL.table.XXXX 14694464 10148171 A
+#
+#
+#################################
+
+
+# setting preamble, detecting scripts location
+scriptsDIR=`dirname $0`
+
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/auxs/auxFns.sh ]]; then
+    . $scriptsDIR/auxs/auxFns.sh --source-only;
+else
+    echo "Error: auxiliary file" $scriptsDIR/auxs/auxFns.sh "NOT found!"
+    exit
+fi
+
+#### CHECKS ###########################################
+welcome
+### CHECK arguments
+if [[ $# -eq 0 ]]; then
+    errMsg "No arguments supplied!";
+fi
+#
+if [[ $# -lt 3 ]]; then
+    errMsg "Three mandatory arguments are needed!";
+fi
+#######
+[ -s $1 ] && INPUTfile=`basename $1` || errMsg "1st argument should be the FINAL.table.* file coming from the RACS' ORF pipeline"
+#######
+
+# check the values for the PF Clusters scores
+checkNbr $2 "Second"
+checkNbr $3 "Third"
+
+# check whether the user wants to generate an additional ordered table
+if [[ $# -eq 4 ]]; then
+    checkOption $4 "Fourth (optional)";
+elif [[ $# -gt 4 ]]; then
+    errMsg "This script can accept only four arguments!"
+fi
+
+
+#######
+
+######################################################
+
+# define the name of the new table with normalized entries
+normTABLE=normalized--$1
+
+# process the file, adding normalizations...
+# the first awk skips the first line of the FINAL.table to avoid its header;
+# a second awk is used to avoid division-by-zero, populating the ratio with "--" instead
+awk -v pfINPUT="$2" -v pfIP="$3" 'NR > 1 {print $0"\t"$5/pfINPUT"\t"$6/pfIP}{next}' $1 | awk '{ if ($7==0) print $0"\t""--"; else print $0"\t"($8/$7)}' > $normTABLE
+
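+# worked example (PF values taken from the usage example above): with
+# PF-INPUT=14694464 and PF-IP=10148171, a row whose read counts are $5=1000
+# and $6=2500 gains 1000/14694464 and 2500/10148171 as columns 7 and 8,
+# and their quotient $8/$7 as column 9 (or "--" whenever column 7 is zero)
+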
+# let's sort the normalized table by the ratio...
+if [[ $4 == "A" ]]; then
+    echo "sorting normalized table in 'A'scending order...";
+    sort -k 9 $normTABLE > $normTABLE--SORTED;
+elif [[ $4 == "D" ]]; then
+    echo "sorting normalized table in 'D'escending order...";
+    sort -k 9 -r $normTABLE > $normTABLE--SORTED;
+fi
+
+# Add header to the new files
+header=`head -1 $1`" \t normalized.INPUT \t normalized.IP \t normalized.IP.INPUT.ratio"
+
+if [[ $4 == "A" || $4 == "D" ]]; then
+    TABLES=$normTABLE--SORTED" "$normTABLE;
+else
+    TABLES=$normTABLE;
+fi
+
+echo $TABLES
+for table in $TABLES; do
+    echo $table;
+    echo -e $header | cat - $table > tmp && mv tmp $table
+done
+
+
+#######################################################################################
diff --git a/core/table.sh b/core/table.sh
new file mode 100755
index 0000000..66fe719
--- /dev/null
+++ b/core/table.sh
@@ -0,0 +1,27 @@
+# table.sh file, part of the RACS ORF pipeline
+# this script is internal to the RACS ORF pipeline, and it is called from countReads.sh
+
+# command-line arguments
+FILE=$1
+#FILE=T_thermophila_June2014.gff3
+filter1=gene
+filter2="Name=TTHERM_"
+#filter3='"hypothetical protein"'
+
+TARGET=$(grep $filter1 $FILE | grep $filter2)
+
+#grep $filter1 $FILE | grep $filter2 | awk '{print $1":"$4"-"$5" "$9}'
+
+# grab scaffold and genes' range
+grep $filter1 $FILE | grep $filter2 | awk '{print $1" "$4"-"$5}' > tmp0
+# grab "TTHERM"
+grep $filter1 $FILE | grep $filter2 | awk 'BEGIN{FS="TTHERM"} {print $2}' | awk 'BEGIN{FS=";Note"} {print "TTHERM"$1}' > tmp1
+# grab 'Note' and replace 'spaces' with 'underscores (_)'
+grep $filter1 $FILE | grep $filter2 | awk 'BEGIN{FS="Note="} {print "Note="$2}' | sed 's/ /_/g' > tmp2
+# compute gene size
+grep $filter1 $FILE | grep $filter2 | awk '{print $5-$4+1}' > tmp3
+
+# put it all together to generate the table
+paste tmp0 tmp1 tmp2 tmp3 | sort -k 1 | sort -t: -n -k 2 | awk '{print $1":"$2"\t"$3"\t"$4"\t"$5"\t"$6}' > table.$FILE
+rm tmp0 tmp1 tmp2 tmp3
+
diff --git a/core/test/lst b/core/test/lst
new file mode 100644
index 0000000..511b51e
--- /dev/null
+++ b/core/test/lst
@@ -0,0 +1,8 @@
+countReads.sh
+table.sh
+comb_tables.sh
+intergenic/interGenes.sh
+intergenic/interGeneRegions.R
+intergenic/det-interGenes.sh
+auxs/testFns.sh
+auxs/auxFns.sh
diff --git a/datasets/IBD1_Intergenic.xlsx b/datasets/IBD1_Intergenic.xlsx
new file mode 100644
index 0000000..8508802
Binary files /dev/null and b/datasets/IBD1_Intergenic.xlsx differ
diff --git a/datasets/IBD1_ORF.xlsx b/datasets/IBD1_ORF.xlsx
new file mode 100644
index 0000000..75ea369
Binary files /dev/null and b/datasets/IBD1_ORF.xlsx differ
diff --git a/datasets/PostProcessing_Genic.xlsx b/datasets/PostProcessing_Genic.xlsx
new file mode 100644
index 0000000..56b58a0
Binary files /dev/null and b/datasets/PostProcessing_Genic.xlsx differ
diff --git a/datasets/PostProcessing_Intergenic.xlsx b/datasets/PostProcessing_Intergenic.xlsx
new file mode 100644
index 0000000..87cd531
Binary files /dev/null and b/datasets/PostProcessing_Intergenic.xlsx differ
diff --git a/hpc/IGR-jobs_parallel.gnu b/hpc/IGR-jobs_parallel.gnu
new file mode 100644
index 0000000..9f91788
--- /dev/null
+++ b/hpc/IGR-jobs_parallel.gnu
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+## example of a sequence of RACS IGR jobs using GNU-parallel
+
+#####
+errMsg() { msg "$*"; helpMsg; exit 111;}
+####
+
+# this script requires GNU-PARALLEL to work, so let's first
+# be sure that it is installed in the system...
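+# (on many HPC systems GNU parallel itself is provided as a module; the exact
+# module name is site-specific, so adjust the environment before running this)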
+GNUPARALLEL=$(which parallel)
+[ -z $GNUPARALLEL ] && errMsg "This script uses GNU-PARALLEL to launch in parallel a series of InterGenic Region processes" || echo "Using $GNUPARALLEL"
+
+# set location of RACS
+RACSloc=$SCRATCH"/REPO/core/"
+interGenic=$RACSloc"/intergenic/det-interGenes.sh"
+
+# select reference GFF3 file
+REFgff3=$SCRATCH"/negCtrl/refs/T_thermophila_June2014.gff3"
+
+# location of the ORF results used as input for the IGR script
+ORFdirs="ORF_RACS_results-"    #20181212-141459"    #"ORF_RACS_results-"
+
+
+# function to explore the directories with RACS ORF results
+processFiles() {
+    for i in $ORFdirs*; do
+        cd $i;    #pwd;    #echo FINAL.table.*1;
+        # generate the list of sample files per directory
+        ls -1 *fastq.gz-sorted.bam > sample.input
+        ### check that the actual alignment and final.table files are there...
+        alnFiles=$(cat sample.input | grep -o -P '(?<=aln).*(?=.fastq.gz-sorted.bam)' | sed 's/ /-/')
+        IGRtable="FINAL.table."$alnFiles
+        #echo "expected files to process..." $alnFiles $IGRtable
+        ###
+        execute=$(for IGRtable in FINAL.table*1; do \
+            echo $interGenic $IGRtable $REFgff3 tableIGR--$IGRtable sample.input ;
+        done)
+        echo "echo $i && cd $i && pwd && $execute && cd .. ";
+        cd ..
+    done
+}
+
+# specify the name of the file where to record the log of the jobs
+LOG=$USER-IGR_RACS-`date '+%Y%m%d-%H%M%S'`.log
+
+# number of tasks to run at the same time, ideally equal to
+# the number of processors (or twice that, if hyper-threading is available)
+cores=$(grep -P '^core id\t' /proc/cpuinfo| wc -l)
+PARALLEL_TASKS_PER_NODE=$cores
+
+# process all the jobs via GNU-PARALLEL
+processFiles | $GNUPARALLEL --joblog $LOG -j $PARALLEL_TASKS_PER_NODE
diff --git a/hpc/modules b/hpc/modules
new file mode 100644
index 0000000..601367c
--- /dev/null
+++ b/hpc/modules
@@ -0,0 +1,8 @@
+# Example of module loading script
+# for the SciNet/Niagara system -- using CCEnv and SAMTOOLS and BWA + required supporting packages nixpkgs and intel compiler
+# November 2018
+
+module load CCEnv
+module load nixpkgs/16.09 intel/2018.3 samtools/1.9 bwa/0.7.17
+
+module list
diff --git a/hpc/submission.pbs b/hpc/submission.pbs
new file mode 100644
index 0000000..b6ea980
--- /dev/null
+++ b/hpc/submission.pbs
@@ -0,0 +1,23 @@
+#!/bin/bash
+# MOAB/Torque submission script for SciNet GPC
+#
+#PBS -l nodes=1:ppn=16,walltime=10:00:00
+#PBS -N RACS_ORF_run -q largemem
+
+# load modules, ie. to use, eg. SamTools and BWA
+module load samtools/1.3.1 bwakit
+
+# DIRECTORY TO RUN - $PBS_O_WORKDIR is the directory the job was submitted from
+cd $PBS_O_WORKDIR
+
+# EXECUTION COMMAND
+# RACS repo located on $SCRATCH
+# data subdirectory containing data files: INPUT and IP files
+# refs subdirectory containing reference files: fasta and gff3 files
+# the pipeline will attempt to use RAMDisk (/dev/shm) and autodetect the maximum number of cores to use
+($SCRATCH/RACS/core/countReads.sh \
+    data/SSCH1-1_Input_S2_R1_001.fastq.gz data/SSCH1-1_ChIP_S1_R1_001.fastq.gz \
+    refs/T_thermophila_June2014_assembly.fasta refs/T_thermophila_June2014.gff3 /dev/shm \
+    >& output.log) &
+
+wait
diff --git a/hpc/submission.sbatch b/hpc/submission.sbatch
new file mode 100644
index 0000000..ea3baf0
--- /dev/null
+++ b/hpc/submission.sbatch
@@ -0,0 +1,31 @@
+#!/bin/bash -l
+##SBATCH -p debug
+#SBATCH -N 1
+#SBATCH --ntasks-per-node=40
+#SBATCH -t 1:00:00
+#SBATCH -J RACS_negative_control
+#SBATCH --mail-user=YOURemailHERE
+#SBATCH --mail-type=ALL
+
+
+# load needed modules // RACS repository located on $SCRATCH
+. $SCRATCH/RACS/hpc/modules
+echo $SLURM_SUBMIT_DIR
+cd $SLURM_SUBMIT_DIR
+
+
+date
+
+# RACS repo located on $SCRATCH
+# data subdirectory containing data files: INPUT and IP files
+# refs subdirectory containing reference files: fasta and gff3 files
+# the pipeline will attempt to use RAMDisk (/dev/shm) and autodetect the maximum number of cores to use
+($SCRATCH/RACS/core/countReads.sh \
+    data/SSCH1-1_Input_S2_R1_001.fastq.gz data/SSCH1-1_ChIP_S1_R1_001.fastq.gz \
+    refs/T_thermophila_June2014_assembly.fasta refs/T_thermophila_June2014.gff3 /dev/shm \
+    >& output.log; \
+echo $?; \
+date) &
+
+wait
diff --git a/tools/BD1_peaks.xlsx b/tools/BD1_peaks.xlsx
new file mode 120000
index 0000000..cd5a2a5
--- /dev/null
+++ b/tools/BD1_peaks.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/BD1_peaks.xlsx
\ No newline at end of file
diff --git a/tools/BD1_peaks_MACS2.xlsx b/tools/BD1_peaks_MACS2.xlsx
new file mode 120000
index 0000000..13acbb7
--- /dev/null
+++ b/tools/BD1_peaks_MACS2.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/BD1_peaks_MACS2.xlsx
\ No newline at end of file
diff --git a/tools/BD2_peaks.xlsx b/tools/BD2_peaks.xlsx
new file mode 120000
index 0000000..6c015a3
--- /dev/null
+++ b/tools/BD2_peaks.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/BD2_peaks.xlsx
\ No newline at end of file
diff --git a/tools/BD2_peaks_MACS2.xlsx b/tools/BD2_peaks_MACS2.xlsx
new file mode 120000
index 0000000..77e583b
--- /dev/null
+++ b/tools/BD2_peaks_MACS2.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/BD2_peaks_MACS2.xlsx
\ No newline at end of file
diff --git a/tools/Ibd1-1.csv b/tools/Ibd1-1.csv
new file mode 120000
index 0000000..6a80a15
--- /dev/null
+++ b/tools/Ibd1-1.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd1-1.csv
\ No newline at end of file
diff --git a/tools/Ibd1-2.csv b/tools/Ibd1-2.csv
new file mode 120000
index 0000000..f2de721
--- /dev/null
+++ b/tools/Ibd1-2.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd1-2.csv
\ No newline at end of file
diff --git a/tools/Ibd1.csv b/tools/Ibd1.csv
new file mode 120000
index 0000000..1c9c41f
--- /dev/null
+++ b/tools/Ibd1.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd1.csv
\ No newline at end of file
diff --git a/tools/Ibd1.xlsx b/tools/Ibd1.xlsx
new file mode 120000
index 0000000..0534352
--- /dev/null
+++ b/tools/Ibd1.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd1.xlsx
\ No newline at end of file
diff --git a/tools/Ibd2-1.csv b/tools/Ibd2-1.csv
new file mode 120000
index 0000000..0c9765e
--- /dev/null
+++ b/tools/Ibd2-1.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd2-1.csv
\ No newline at end of file
diff --git a/tools/Ibd2-2.csv b/tools/Ibd2-2.csv
new file mode 120000
index 0000000..cb4a5a7
--- /dev/null
+++ b/tools/Ibd2-2.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd2-2.csv
\ No newline at end of file
diff --git a/tools/Ibd2.csv b/tools/Ibd2.csv
new file mode 120000
index 0000000..6255f48
--- /dev/null
+++ b/tools/Ibd2.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd2.csv
\ No newline at end of file
diff --git a/tools/Ibd2.xlsx b/tools/Ibd2.xlsx
new file mode 120000
index 0000000..e66b7ac
--- /dev/null
+++ b/tools/Ibd2.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd2.xlsx
\ No newline at end of file
diff --git a/tools/compare.R b/tools/compare.R
new file mode 100644
index 0000000..ccda1b3
--- /dev/null
+++ b/tools/compare.R
@@ -0,0 +1,318 @@
+#########################################################################
+# RACS: Comparison Tools
+#
+# RACS utilities for comparison with MACS2 results
+#
+# requires the "utils_RACS-compare.R" utilities file with fn defns
+#
+#
+# HOW TO USE this script:
+#
+# run it with Rscript, or source("compare.R") from an R session;
+# it loads the RACS and MACS2 result files referenced below and runs the comparisons
+#
+#
+########################################################################
+
+
+#########################################################################
+
+# function to load data from a spreadsheet format
+load.data <- function(filename,spix=1) {
+
+    # 1st check that the file exists
+    checkFile <- function(filename) {
+        while (!file.exists(filename)) {
+            cat(paste(filename,"not found!",'\n'))
+            cat("Please select a file")
+            filename <- file.choose()
+        }
+        return(filename)
+    }
+
+    # check that the XLSX package is available and load it
+    loadCheckPkg("xlsx")
+
+    cat(paste("Loading file",filename,'\n'))
+    filename <- checkFile(filename)
+
+    # reading data
+    sample <- read.xlsx(filename,spix)
+    return(sample)
+}
+
+
+# function to check whether a package is available in the system
+loadCheckPkg <- function(pckgs){
+    for (pckg in pckgs) {
+        # check whether the package is NOT loaded
+        if (! paste('package:',pckg,sep="") %in% search()) {
+            # check whether the package is available in the system
+            if (pckg %in% .packages(all.available = TRUE)) {
+                # load the package
+                cat("Loading library",pckg,"... \n")
+                library(pckg, character.only=TRUE)
+            } else {
+                msg <- paste("Package:",pckg, "not found! You need to install this package using",paste("install.packages('",pckg,"')",sep=""))
+                stop(msg)
+            }
+        }
+    }
+}
+
+#######
+
+
+### TEST CASES
+sample1 <- load.data("BD1_peaks.xlsx",1)
+sample2 <- load.data("BD1_peaks_MACS2.xlsx",1)
+
+
+# ACTUAL DATA
+sampleIBD1 <- read.csv("Ibd1-2.csv",sep='\t')
+sampleIBD2 <- read.csv("Ibd2-2.csv",sep='\t')
+
+macsIBD1 <- load.data("BD1_peaks_MACS2.xlsx",1)
+macsIBD2 <- load.data("BD2_peaks_MACS2.xlsx",1)
+
+## IBD1/2 cases
+ibdX <- function(sample,ref,filename="ibdX.pdf") {
+    ibdX <- comparison(sample,ref,FALSE)
+    pdf(file=filename)
+    vizDiffs(ibdX)
+    dev.off()
+}
+#####
+
+tests <- function() {
+    sample1 <- data.frame()
+    sample1 <- rbind(sample1, c(as.character("scfld_01"), as.numeric(1),as.numeric(101)) )
+
+}
+
+#######
+
+# MAIN functions of the comparison tool
+
+overlap <- function(x1,x2, y1,y2){
+    # NO overlap condition
+    if (x1 > y2) return(+1.0)
+    if (x2 < y1) return(-1.0)
+
+    ax = (x2 - x1)
+    ay = (y2 - y1)
+    #print(ax)
+    #print(ay)
+
+    #if ((x1<=y1) & (x2>=y2)) return(+ax/ay)
+    if (x1<=y1) return(+ax/ay)
+    #if ((x1>y1) & (x2<y2)) return(-ay/ax)
+    if (x1>y1) return(-ay/ax)
+    if ((x1==y1)&(x2==y2)) return(0)
+
+    return(NA)
+}
+
+######
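+# worked example of the overlap() convention:
+#   overlap(100,200, 300,400) -> -1   and   overlap(300,400, 100,200) -> +1,
+#     ie. a return value of +/-1 flags completely disjoint intervals;
+#   overlap(100,200, 150,400) -> +100/250 = +0.4, ie. for overlapping
+#     intervals the sign records which one starts first (positive when the
+#     first interval does) and the magnitude is the ratio of their sizes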
+
+comparison.ALL <- function(sample1,sample2) {
+
+    # identify the scaffolds
+    scflds_smpl1 <- unique(as.character(sample1$Region))
+    scflds_smpl2 <- unique(as.character(sample2$Region))
+
+    ### 1-to-1 comparison
+    results <- data.frame()
+    for (scfld in scflds_smpl1) {
+        #scfld <- scflds_smpl1[1]
+        print(scfld)
+        # define the subsets based on the scaffold to work with
+        sset1 <- sample1[sample1$Region == scfld,]
+        sset2 <- sample2[sample2$Region == scfld,]
+        # order the subsets per intervals
+        sset1 <- sset1[order(sset1$start),]
+        sset2 <- sset2[order(sset2$start),]
+
+        for (i1 in 1:dim(sset1)[1]){
+            for (i2 in 1:dim(sset2)[1]) {
+                ovrlap <- overlap(sset1$start[i1],sset1$end[i1], sset2$start[i2], sset2$end[i2])
+                if (is.numeric(ovrlap)) {
+                    cat(i1,i2,'\n')
+                    print(ovrlap)
+                    results <- rbind(results,c(scfld,sset1$start[i1],sset1$end[i1],sset2$start[i2], sset2$end[i2], ovrlap))
+                }
+            }
+        }
+    }
+    colnames(results) <- c("scfld","x1","x2","y1","y2","overlap")
+    results[abs(results$overlap) != 1,]
+    return(results)
+}
+
+########
+
+comparison <- function(sample1,sample2, DBG=TRUE) {
+
+    # sort the samples per scaffold
+    sample1 <- sample1[order(sample1$Region),]
+    sample2 <- sample2[order(sample2$Region),]
+
+    # identify the scaffolds
+    scflds_smpl1 <- unique(as.character(sample1$Region))
+    scflds_smpl2 <- unique(as.character(sample2$Region))
+    scflds <- c(scflds_smpl1,scflds_smpl2)
+    scflds <- unique(scflds[order(scflds)])
+
+    ### 1-to-region comparison
+    results <- c()
+    for (scfld in scflds_smpl1) {
+#   for (scfld in scflds) {
+        #scfld <- scflds_smpl1[1]
+        print(scfld)
+        # define the subsets based on the scaffold to work with
+        sset1 <- sample1[sample1$Region == scfld,]
+        sset2 <- sample2[sample2$Region == scfld,]
+        # order the subsets per intervals
+        sset1 <- sset1[order(sset1$start),]
+        sset2 <- sset2[order(sset2$start),]
+        dsset1 <- dim(sset1)[1]
+        dsset2 <- dim(sset2)[1]
+
+        if (dsset1 == 0) {
+            if (DBG) {
+                # no data registered for this scaffold in sample1
+                cat("no data registered for this scaffold:",scfld," in sample1",'\n')
+                print(sset1)
+                print(sset2)
+            }
+            tmpreg <- c(as.character(scfld),"--","--", sset2$start,sset2$end, as.numeric(-1))
+            if (DBG) print(tmpreg)
+            results <- rbind(results, as.character(tmpreg))
+        } else if (dsset2 == 0) {
+            if (DBG) {
+                cat("no data registered for this scaffold:",scfld," in sample2",'\n')
+                print(sset1)
+                print(sset2)
+            }
+            tmpreg <- c(as.character(scfld),"--","--", sset1$start,sset1$end, as.numeric(+1))
+            if (DBG) print(tmpreg)
+            results <- rbind(results, as.character(tmpreg))
+        } else {
+            for (i1 in 1:dsset1){
+                regResults <- c()
+                for (i2 in 1:dsset2) {
+                    ovrlap <- overlap(sset1$start[i1],sset1$end[i1], sset2$start[i2], sset2$end[i2])
+                    if (!is.na(ovrlap)) {
+                        cat(scfld,i1,i2,"---",sset1$start[i1],sset1$end[i1], sset2$start[i2], sset2$end[i2],'\n')
+                        print(ovrlap)
+                        newRecord <- c(as.character(scfld),as.numeric(sset1$start[i1]),as.numeric(sset1$end[i1]),as.numeric(sset2$start[i2]), as.numeric(sset2$end[i2]), as.numeric(ovrlap))
+                        if (DBG) print(paste("<<<<<<",newRecord))
+                        regResults <- rbind(regResults, as.character(newRecord))
+                        #regResults <- rbind(regResults, c(as.character(scfld),as.numeric(sset1$start[i1]),as.numeric(sset1$end[i1]),as.numeric(sset2$start[i2]), as.numeric(sset2$end[i2]), as.numeric(ovrlap)) )
+                    }
+                }
+                #regResults <- cbind(scfld,regResults)
+                colnames(regResults) <- c("scafold","x1","x2","y1","y2","overlap")
+                regResults <- as.data.frame(regResults)
+                filterCond <- abs(as.numeric(as.character(regResults[!is.na(regResults$overlap),]$overlap)))
+                if ( dim(regResults[filterCond == 1,])[1] == dsset2 ) {
+                    # no match at all
+                    results <- rbind(results, c(as.character(scfld),sset1$start[i1],sset1$end[i1], "--","--", as.numeric(as.character(regResults$overlap[1]))))
+                } else {
+                    ##results <- rbind(results, regResults[abs(regResults$overlap) != 1, ])
+                    #results <- rbind(results, regResults[filterCond != 1,])
+                    matchingRegs <- regResults[filterCond != 1,]
+                    if (DBG) { cat("######"); print(matchingRegs) ; print(dim(matchingRegs))}
+                    #####if (dim(matchingRegs)[1] > 1) stop()
+                    for (k in 1:dim(matchingRegs)[1]) {
+                        krecord <- matchingRegs[k,]
+                        newMatchingReg <- c(as.character(krecord$scafold),as.character(krecord$x1),as.character(krecord$x2),as.character(krecord$y1),as.character(krecord$y2),as.character(krecord$overlap))
+                        if (DBG) { cat("###### ::: k ... "); print(newMatchingReg) }
+                        #######results <- rbind(results, c(as.character(matchingRegs$scafold),as.character(matchingRegs$x1),as.character(matchingRegs$x2),as.character(matchingRegs$y1),as.character(matchingRegs$y2),as.character(matchingRegs$overlap)) )
+                        #results <- rbind(results, c(as.character(scfld),sset1$start[i1],sset1$end[i1],sset2$start[i2], sset2$end[i2],as.numeric(as.character(matchingRegs$overlap))) )
+                        results <- rbind(results,as.character(newMatchingReg))
+                    }
+                }
+            }
+        }
+    }
+    colnames(results) <- c("scafold","x1","x2","y1","y2","overlap")
+    rownames(results) <- c()
+    results <- as.data.frame(results)
+    results$overlap <- as.numeric(as.character(results$overlap))
+    results[abs(results$overlap) != 1,]
+
+    return(results)
+}
+
+######
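+# note on the 'overlap' column of the returned data.frame (encoding as above):
+#   +1 -> region present only in sample1 (no counterpart found in sample2)
+#   -1 -> region present only in sample2
+#   anything else -> overlapping regions, with the signed size ratio from overlap()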
+
+vizDiffs <- function(results, filename="", threshold=1000, iplots=FALSE) {
+    res <- results[!is.na(results$overlap),]
+    res <- res[res$overlap<threshold,]
+    yvar <- res$overlap
+
+    # determine max/min in plots
+    mmin <- round(min(yvar))
+    mmax <- round(max(yvar))
+    mm <- max(abs(mmin),abs(mmax))
+    yrange <- c(-mm,+mm)
+    xrange <- c(1, length(yvar))
+
+    # interactive plots
+    if (iplots) {
+        loadCheckPkg("plotly")
+        #loadCheckPkg("pandoc")
+        p1 <- plot_ly(x = c(results$scafold), y = results$overlap, type="bar")
+        p2 <- plot_ly(y = results$overlap, group=results$scafold, type="bar")
+        p3 <- plot_ly(x=~results$scafold, y = results$overlap, group=results$scafold, type="bar")
+
+        htmlwidgets::saveWidget(as.widget(p1), paste(filename,"-p1.html"), selfcontained=FALSE)
+        htmlwidgets::saveWidget(as.widget(p2), paste(filename,"-p2.html"), selfcontained=FALSE)
+        htmlwidgets::saveWidget(as.widget(p3), paste(filename,"-p3.html"), selfcontained=FALSE)
+    }
+
+    # static plots
+    if (filename!="") pdf(file=filename)
+    if (filename=="") dev.new()
+    plot(res$scafold,yvar)
+    abline(h = 0, v = 0, col = "gray60")
+
+    if (filename=="") dev.new()
+    #par(mfrow=c(2,1))
+    barplot(res$overlap, ylim=yrange)
+    box()
+    par(new=T)
+    plot(yvar, cex=0.5, ylim=yrange, ann=F, axes=F)
+    abline(h = 0, v = 0, col = "gray60")
+
+    if (filename=="") dev.new()
+    par(mfrow=c(2,1))
+    plot(res$overlap, cex=0.5, xlim=xrange, ylim=yrange, type='b')
+    abline(h = 0, v = 0, col = "gray60")
+    #par(new=TRUE)
+    plot(which(res$overlap>0), res[res$overlap>0,]$overlap, cex=.5, col='blue', xlim=xrange, ylim=yrange)
+    par(new=TRUE)
+    plot(which(res$overlap<0), res[res$overlap<0,]$overlap, cex=.5, col='red', xlim=xrange, ylim=yrange)
+    abline(h = 0, v = 0, col = "gray60")
+
+
+    if (filename=="") dev.new()
+    par(mfrow=c(1,1))
+    g <- yvar
+    h <- hist(g, freq=TRUE, breaks = 75, xlim=c(-50,50), density = 40, col = "lightgray", xlab = "overlap", main = "RACS vs MACS")
+    xfit <- seq(min(g), max(g), length = length(g))
+    yfit <- dnorm(xfit, mean = mean(g), sd = sd(g))
+    yfit <- yfit * diff(h$mids[1:2]) * length(g)
+    lines(xfit, yfit, col = "black", lwd = 2)
+    text(0,max(yfit*1.10),paste("mean=",mean(g), " -- ", "sd=",sd(g)))
+
+    if (filename!="") dev.off()
+}
+
+######
diff --git a/tools/ibdX.pdf b/tools/ibdX.pdf
new file mode 100644
index 0000000..7f2a4d2
Binary files /dev/null and b/tools/ibdX.pdf differ