diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..6a85685
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,3 @@
+MARCELO PONCE (mponce@scinet.utoronto.ca)
+ALEJANDRO SAETTONE (alejandro.saettone@ryerson.ca)
+
diff --git a/CITATION b/CITATION
new file mode 100644
index 0000000..9e515e9
--- /dev/null
+++ b/CITATION
@@ -0,0 +1,2 @@
+Please cite our paper (CITE_RACS) whenever you are using RACS.
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..74fc02c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+Copyright 2018 Marcelo Ponce, Alejandro Saettone
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+- Please cite our paper (CITE_RACS) whenever you are using RACS.
+
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
diff --git a/README b/README
new file mode 100644
index 0000000..227db41
--- /dev/null
+++ b/README
@@ -0,0 +1,358 @@
+RACS: Rapid Analysis of ChIP-Seq data for contig-based genomes
+
+These tools are a series of scripts developed to facilitate the analysis
+of ChIP-Seq data; they have been applied to the organism T. thermophila.
+
+The scripts should work on any Linux-type OS (i.e. Mac OS and Linux).
+They require the following programs to be installed:
+   - SAMtools
+   - BWA
+   - R
+
+The first two are needed for the determination of reads in open reading frames
+(ORF, i.e. genic regions), while SAMtools and R are needed for the determination
+of intergenic regions (IGR).
+
+The main scripts are located in the "core" directory.
+Additionally, we provide other subdirectories containing the following:
+   - "hpc": submission scripts for typical job managers and schedulers in HPC environments
+   - "tools": comparison tools developed to compare against other software, e.g. MACS
+   - "datasets": examples of the data used to test and run our pipeline
+
+This pipeline is open source under the MIT license, and researchers are welcome to use it.
+We would appreciate users letting us know of any bugs they find, as well as any
+feedback about their experience using RACS.
+Please cite our paper (CITE_RACS) whenever you are using RACS.
+
+
+
+* INSTALLATION
+
+The scripts composing this pipeline are available as open source tools,
+and can be obtained from any of the following repositories:
+
+   https://gitrepos.scinet.utoronto.ca/public/RACS.git
+
+or
+
+   https://bitbucket.org/mjponce/racs
+
+Both repositories are synchronized, so that the latest version of RACS will be available in both
+repositories simultaneously.
+
+To obtain and install a copy of RACS on your computer, open a terminal (you will need git and
+an internet connection!)
+and type:
+
+   git clone https://gitrepos.scinet.utoronto.ca/public/RACS.git
+
+That should clone (download and copy) the latest version of our pipeline onto your computer,
+creating a directory called "RACS", containing the following files and sub-directories:
+
+``
+RACS
+├── AUTHORS
+├── CITATION
+├── LICENSE
+├── README
+├── WARRANTY
+├── core
+│   ├── countReads.sh
+│   ├── table.sh
+│   ├── comb_tables.sh
+│   ├── auxs
+│   │   ├── auxFns.sh
+│   │   └── testFns.sh
+│   ├── intergenic
+│   │   ├── det-interGenes.sh
+│   │   ├── interGeneRegions.R
+│   │   └── interGenes.sh
+│   └── test
+│       └── lst
+├── datasets
+│   ├──
+│   ├──
+│   └──
+├── hpc
+│   ├── submission.pbs
+│   ├── submission.slurm
+│   ├── IGR-jobs_parallel.gnu
+│   └── modules
+└── tools
+    ├──
+    ├──
+    └──
+''
+
+
+Updating RACS:
+Because the RACS pipeline is available under version control (git), you can easily get updates
+and the latest additions to the RACS pipeline by simply typing the following command in the
+RACS directory:
+
+   git pull
+
+This will bring into your local installation the latest updates to the pipeline.
+Similarly, any of the git functionalities should work as expected, e.g. git log, git status, etc.
+
+
+
+* Integrity checks
+For integrity purposes we have included a series of tools to check whether the pipeline's
+internal script dependencies, as well as the external programs it relies on, are available
+in the system, so that RACS' functionality is guaranteed.
+Some of RACS' routines will run these integrity checks whenever appropriate at the moment
+of execution.
+The user can also test this by executing the following command in the shell:
+
+   PATHtoRACSrepo/core/auxs/testFns.sh
+
+At this point a series of checks will be executed and the results shown on the screen, e.g.
+
+*** CHECKING RACS pipeline INTERNAL INTEGRITY...
+   countReads.sh  ... ok!
+   table.sh  ... ok!
+   comb_tables.sh  ... ok!
+   intergenic/interGenes.sh  ... ok!
+   intergenic/interGeneRegions.R  ... ok!
+   intergenic/det-interGenes.sh  ... ok!
+   auxs/testFns.sh  ... ok!
+   auxs/auxFns.sh  ... ok!
+*** CHECKING EXTERNAL DEPENDENCIES with auxiliary software: BWA, SAMTOOLS & RSCRIPT...
+   bwa  ... found!
+   samtools  ... found!
+   Rscript  ... found!
+
+If any of the dependencies is not met, an error message describing the issue will be
+displayed, followed by basic instructions on how to use RACS.
+
+
+
+* HOW TO USE THE 'racs' PIPELINE:
+
+RACS offers two main functionalities:
+- count reads in ORF
+and
+- identify reads in intergenic regions (using information about the biology of the model)
+
+
+* Counting Reads in ORF:
+
+- "countReads": countReads.sh is the main script in the pipeline; it is a shell
+script in charge of implementing and distributing the actual pipeline to count
+the reads. It combines several instances of shell commands and offloads work
+to the packages SAMtools and BWA.
+This script can utilize multi-core parallelization when possible, via threads
+which can be explicitly input by the user or automatically detected by the script.
+
+At the end of the execution the script will produce a table indicating XXXXX.
+The resulting files, some intermediate files and the final tables, are located
+in a directory named 'ORF_RACS_results-YYYYMMDD-HHMMSS', created in the directory
+from where the script is executed, where 'YYYYMMDD-HHMMSS' indicates the
+year-month-day and time when the script was executed.
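+
+For instance, since each run creates a new timestamped results directory, the most
+recent one can be located with a one-liner like the following (just a sketch, not
+part of the pipeline itself):
+
+   ls -d ORF_RACS_results-*/ | sort | tail -1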
+
+
+Inputs:
+   - INPUT file (fastq.gz)
+   - IP file (fastq.gz)
+   - fasta assembly file (.fasta)
+   - reference genome file (.gff3)
+
+Outputs:
+   - several sam, bam and bai files for each input (INPUT and IP) file
+   - intermediate tables with reads for the INPUT and IP files, named 'tableReads{INPUT/IP}.{input/ip-filename}'
+   - final table sorted by scaffold and localization within the scaffold,
+     named 'FINAL.table.{INPUTfilename}-{IPfilename}'
+
+
+When executing the scripts, please indicate the full path to the location of
+the script. In this way the script will determine where the repository is
+located and find the different tools needed during the execution.
+Also, please do not "source" the scripts; they are set to be run as executable
+scripts. I.e. execute the scripts in this way:
+
+   PATHtoRACS/core/../scriptNAME.sh args
+
+Do *NOT* execute them in this way:
+
+   . PATHtoRACS/core/../scriptNAME.sh args
+
+nor in this way either:
+
+   source PATHtoRACS/core/../scriptNAME.sh args
+
+Our scripts will also try to detect this situation and prevent the user from doing this.
+Due to the modular implementation we designed for RACS, and in order to allow RACS
+to locate its main components, the scripts need to be executed as described above
+and not sourced.
+
+
+Integrity & Sanity Checks
+In each of the main RACS scripts we have incorporated several integrity checks which
+are run at the very beginning of the scripts' execution. These tests include
+determining whether the tools used (e.g. BWA, SAMTOOLS or R) are installed in the
+system and available to be used within the pipeline.
+Similarly, by checking the location of the script, the pipeline verifies that the
+other components of the pipeline are also in place and can be found, so that the
+pipeline can run without any problems.
+In this way, there is no need to add the different scripts of the pipeline to the
+PATH, and the scripts are self-aware of where they are placed.
+To achieve this, the scripts need to be called specifying their full location on
+the computer.
+
+The different scripts in the pipeline will also check the arguments specified; in
+particular, when these are supposed to be existing files, that they actually exist!
+We basically tried our best to implement defensive programming across the different
+scripts that compose the RACS pipeline, to protect its proper execution and help the
+user establish the proper way of using RACS.
+We also included a simple testing routine, in the 'auxs' subdir, that can be used to
+run some of the integrity tests that the pipeline will be checking during execution,
+as described in the section above.
+
+
+Arguments to the script:
+   1st argument: file with INPUT reads, usually a ".fastq.gz" file, obtained from the sequencer
+   2nd argument: file with IP reads, usually a ".fastq.gz" file, obtained from the sequencer
+   3rd argument: reference genome file (fasta)
+   4th argument: annotation file (gff3)
+   5th argument: working space (if possible use RAMdisk --ie. /dev/shm/--, or
+      /tmp in a computer with SSD)
+   6th argument (optional): number of cores to use for BWA multi-threading.
+      If this argument is not specified, the script will attempt to determine
+      automatically the number of cores and use that number to run in multi-threading.
+      If you want to run the script serially, i.e. with just one core, you should
+      set this argument to 1.
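+
+For example, a minimal serial (single-core) run would look like this, where the
+data file names are placeholders for your own files:
+
+   PATHtoRACSrepo/core/countReads.sh INPUT.fastq.gz IP.fastq.gz assembly.fasta annotation.gff3 /tmp/ 1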
+
+
+The main output file, FINAL.table."INPUTfile"-"IPfile"
+(where "INPUTfile" and "IPfile" refer to the corresponding INPUT and IP files),
+is ordered by the scaffold localization.
+Notice that this file <<FINAL.table."INPUTfile"-"IPfile">> is the one that
+will be used in the second part of the pipeline, when detecting the intergenic regions.
+This final file is a tab-separated text file containing the following columns:
+
+   location   name   description   geneSize   INPUT.reads   IP.reads
+
+where
+   "location" is the scaffold location of the gene
+   "name" is the name of the gene
+   "description" is a description of the gene
+   "geneSize" represents the size of the gene
+   "INPUT.reads" is the number of reads (calls) for the INPUT file
+   "IP.reads" is the number of reads (calls) for the IP file
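+
+As a quick illustration of this layout, a per-gene IP/INPUT enrichment ratio could
+be computed from columns 5 and 6 with a one-liner like the following (a sketch only,
+not part of the pipeline; it skips the header line and genes without INPUT reads):
+
+   awk -F'\t' 'NR>1 && $5>0 {print $1"\t"$6/$5}' FINAL.table.INPUTfile-IPfile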
+
+
+
+* Normalization of ORF by PF Cluster Scores
+If your data contains the Passing Filter (PF) cluster scores, you can use an additional
+shell script that we have included to normalize your INPUT/IP reads.
+This script is called "normalizedORF.sh" and is located in the core subdirectory.
+The script requires three mandatory arguments:
+   1st argument: "FINAL.table.*" file generated from the RACS' ORF pipeline
+   2nd argument: "PF-INPUT-value", PF value corresponding to the INPUT file
+   3rd argument: "PF-IP-value", PF value corresponding to the IP file
+
+Please notice that arguments 2 and 3 are the actual numerical values corresponding to
+the PF clusters for the INPUT and IP respectively.
+
+
+* Determination of InterGenic Regions
+The second main functionality of our RACS pipeline is the ability to determine
+intergenic regions, based on the results from the ORF reads and the biology of the
+model organism.
+To achieve this second part, we combined a series of shell scripts and R scripts.
+As before, when executed, the pipeline will perform some sanity and integrity
+checks to guarantee that all the necessary pieces are in place.
+All the scripts for this part of the pipeline are located in the 'intergenic' subdir.
+The flow is directed by a simple shell script, 'det-interGenes.sh', which receives
+four command-line arguments and calls an R script, 'interGeneRegions.R', and,
+by the end of the execution, a second shell script, 'interGenes.sh'.
+The R script, 'interGeneRegions.R', is the actual brain and main script in charge of
+determining the regions between genic boundaries.
+It is designed in a modular fashion, implemented as a main driver script
+('interGeneRegions.R' itself) and a utility file ('utils_RACS-IGR.R') where all the
+functions used in the main driver are defined.
+The second shell script, 'interGenes.sh', is in charge of computing the number of
+reads/calls within the determined intergenic regions, for which it also uses SAMtools.
+The main shell script keeps track of the time employed in each stage of the process,
+and offers a summary of it at the end of the execution.
+
+Arguments to the script:
+   arg1: final combined table generated by the ORF part of the RACS pipeline
+   arg2: reference genome file (gff3)
+   arg3: name of the file where to save the InterGenic Regions
+   arg4: text file containing the names of the sorted BAM output files from the
+         ORF part of the pipeline; e.g.
+            alnDATASET_INDICATORS_ChIP_SXX_RY_ZZZ.fastq.gz-sorted.bam
+            alnDATASET_INDICATORS_Input_SXX_RY_ZZZ.fastq.gz-sorted.bam
+
+
+
+
+
+Examples:
+I) calling peaks for ORF
+I.i) the following command will run the countReads.sh script using:
+   - 'data2/_1_MED1_INPUT_S25_L007_R1_001.fastq.gz' as the file with the INPUT reads
+   - 'data2/_3_MED1_IP_S27_L007_R1_001.fastq.gz' as the file with the IP reads
+   - 'T_thermophila_June2014_assembly.fasta' as the reference genome for the T. thermophila organism
+   - 'T_thermophila_June2014.gff3' as the annotation file for the T. thermophila
+   - '/tmp/' as working space
+
+   PATHtoRACSrepo/core/countReads.sh  data2/_1_MED1_INPUT_S25_L007_R1_001.fastq.gz  data2/_3_MED1_IP_S27_L007_R1_001.fastq.gz  T_thermophila_June2014_assembly.fasta  T_thermophila_June2014.gff3  /tmp/
+
+
+I.ii) the following command will run the countReads.sh script using the same
+arguments as before, but specifying to use "/dev/shm" (RAMdisk) instead
+of "/tmp" as temporary storage for auxiliary files, and 16 threads in the
+parallel regions of the pipeline.
+Additionally, it is using the system's 'time' command to measure how long the
+pipeline takes to run.
+
+   time PATHtoRACSrepo/core/countReads.sh  data2/_1_MED1_INPUT_S25_L007_R1_001.fastq.gz  data2/_3_MED1_IP_S27_L007_R1_001.fastq.gz  T_thermophila_June2014_assembly.fasta  T_thermophila_June2014.gff3  /dev/shm/  16
+
+
+II) Determination of InterGenic Regions (IGR)
+II.i) the following command will determine the intergenic regions, using:
+   - 'combinedTABLES_MED1-MED2' as the table which was determined in the ORF part of the pipeline
+   - 'dataset/T_thermophila_June2014.sorted.gff3' as the micro-organism reference genome file
+   - 'interGENs_MED1-MED2.csv' as the name of the table that the IGR part of the pipeline will
+     generate, i.e. this will be the output of this part of the pipeline
+   - 'samples.file' as a text file containing the names of the BAM output files, also generated
+     when running the ORF part of RACS; they are usually named as
+        alnDATASET_INDICATORS_ChIP_SXX_RY_ZZZ.fastq.gz-sorted.bam
+        alnDATASET_INDICATORS_Input_SXX_RY_ZZZ.fastq.gz-sorted.bam
+
+   PATHtoRACSrepo/core/intergenic/det-interGenes.sh  combinedTABLES_MED1-MED2  dataset/T_thermophila_June2014.sorted.gff3  interGENs_MED1-MED2.csv  samples.file
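+
+A 'samples.file' like the one used above can be generated from within an ORF results
+directory with something as simple as (a sketch; it assumes the sorted BAM files
+produced by the ORF run are in the current directory):
+
+   ls aln*fastq.gz-sorted.bam > samples.file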
+
+
+II.ii) we included a submission script in the 'hpc' directory, named "IGR-jobs_parallel.gnu",
+which will scan the current directory looking for sub-directories named "ORF_RACS_*",
+which is the way RACS names the outcome of running the RACS' ORF pipeline.
+When the script detects one of these directories, it will look inside it and generate the
+corresponding 'samples.file' containing the names of all the aln*fastq.gz-sorted.bam files
+within this directory.
+When the search for ORF sub-directories is done, it will launch the IGR part of the pipeline
+in *parallel* for ALL the ORF results, using an 'embarrassingly parallel' approach via
+GNU-Parallel.
+Assuming you are located in a directory containing several ORF sub-directories, you will just
+run it in this way:
+
+   PATHtoRACSrepo/hpc/IGR-jobs_parallel.gnu
+
+In principle the script does not require any command-line argument, but it contains several
+environment variables defined in it that should be adjusted for the particular system where
+it will run, i.e.:
+   - RACSloc="location where RACS is installed in your system"
+   - REFgff3="location where the genome reference file of the organism is located in your system"
+
+Additionally, one could adjust the following variables:
+   - PARALLEL_TASKS_PER_NODE="number of parallel jobs to run at the same time", although
+     by default the script will try to determine the maximum number of cores available
+     on the system
+   - ORFdirs="ORF_RACS_results-", matching pattern for the beginning of the directories
+     generated by the RACS' ORF
+
+
+III) normalization and cut-off with negative control
+
+
+IV) normalization and cut-off without a negative control
+
+
+V) comparison to MACS and XXXX
+
diff --git a/WARRANTY b/WARRANTY
new file mode 100644
index 0000000..1b09e22
--- /dev/null
+++ b/WARRANTY
@@ -0,0 +1,8 @@
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/core/auxs/auxFns.sh b/core/auxs/auxFns.sh
new file mode 100755
index 0000000..965336e
--- /dev/null
+++ b/core/auxs/auxFns.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+## -vx
+
+# auxiliary set of functions used in other scripts
+# it includes:
+#   - welcome(): RACS welcome message and credits
+#
+#   - helpMsg(): help message describing how to use this script
+#
+#   - usage(): an improved self-documented help fn based on the same script's comments
+#
+#   - checkTools(): fn to check the availability of a given tool,
+#     by looking at whether such an executable/program is present in the current system;
+#     the fn can process several arguments at once
+#
+#   - checkIntegrityPipeline(): fn to check scripts internal to the pipeline;
+#     the fn will check whether a specific script exists and has the executable attribute;
+#     the fn can process several arguments at once
+#
+#################################
+
+
+### functions ########################################
+
+##### Informative functions
+welcome() {
+    echo "RACS v1.0 (2018/2019) -- Open source tools for Analyzing ChIP-Seq data"
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+}
+
+##
+
+helpMsg(){
+    scriptsDIR=`dirname $0`
+    welcome
+    echo $scriptsDIR
+    echo "How to use this script:"
+    more $scriptsDIR/../../README
+    exit 1;
+}
+##
+usage() {
+    shebang="#!/bin/bash"
+    scriptName=`basename $0`
+    scriptsDIR=`dirname $0`
+    welcome
+    echo "************************"
+    echo $scriptName
+    echo "************************"
+    echo "How to use this script:"
+    upto=$(grep -v $shebang $0 | grep -n "####################" | head -1 | awk -F":" '{print $1}')
+    grep -v $shebang $0 | head -$upto | more
+    exit 222
+}
+
+##
+
+##### Checking/Testing Functions
+checkTools(){
+    echo "checking for tools required for RACS..."
+    for tool in "$@"; do
+        # look for the 'tool' in the executable path
+        statusTOOL=$(which $tool);
+        # if 'which' returned an empty string, the tool was NOT found!
+        [ -z $statusTOOL ] && errMsg "$tool needs to be installed for using this pipeline!" || echo "   $tool ... found!"
+    done
+}
+
+#####
+
+checkIntegrityPipeline(){
+    echo "Verifying RACS integrity..."
+    scriptsDIR=`dirname $0`
+    for dep in "$@"; do
+        # this will make the fn look in the location from where it is being called,
+        # ie. relative to the location of the calling script...
+        testing=$scriptsDIR/$dep
+        #echo $testing
+        # check that the file exists and has the execution attribute
+        [ -x $testing ] && echo "   $dep ... ok!" || errMsg "err.critical: missing '$dep' from pipeline directory $scriptsDIR: $testing"
+    done
+}
+
+#########
+
+#####
+# Functions for checking command-line arguments
+#####
+
+# Function that checks whether the argument is an existing file
+checkFile() {
+    for file in "$@"; do
+        echo 'verifying file...' $file
+        [ ! -f "$file" ] && errMsg "File $file NOT found!"
+    done
+}
+
+
+#########
+
+# Function that checks that the argument is of numeric type, ie. an integer number
+checkNbr() {
+    numericDigits='^[0-9]+$'
+    if ! [[ $1 =~ $numericDigits ]] ; then
+        errMsg "$2 argument should be a positive integer number! Check your PF cluster information!"
+    fi
+}
+
+#########
+
+# Function that checks whether the argument is of a particular type
+checkOption() {
+    if [[ $1 != "A" && $1 != "D" ]] ; then
+        errMsg "$2 argument should be either 'A' for ascending or 'D' for decreasing order!"
+    fi
+}
+
+#########
+
+######
+# Other miscellaneous fns
+######
+
+
+# Function to detect the number of cores
+
+detectCores() {
+    nbrCores=$(grep -P '^core id\t' /proc/cpuinfo | wc -l)
+    echo $nbrCores
+}
+
+
+# functions to time execution
+
+tick() {
+    t0=`date +%s`
+}
+
+tock(){
+    t1=`date +%s`
+    echo $((t1-t0))
+}
+
+########################################################
+
+######################################################
+# helper fns for handling error messages
+pausee() { echo "----------------------------------------"; read -p "$*"; }
+msg() { echo "$0: $*" >&2; }
+errMsg() { msg "$*"; pausee 'Press [Enter] to continue...'; usage; exit 111;}
+try() { "$@" || errMsg "cannot $*"; }
+#######################################################
+
+######################################################
+
+# ====================================================
+myDIR=$(pwd)
+resultsDIR=$myDIR/results-`date '+%Y%m%d-%H%M%S'`    #`date +%D-%T`
+
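+
+# Example usage of the timing helpers above (a sketch, for illustration only;
+# 'some_command' is a placeholder for any stage of the pipeline):
+#
+#    tick
+#    some_command
+#    elapsed=$(tock)
+#    echo "stage took ${elapsed} seconds"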
diff --git a/core/auxs/testFns.sh b/core/auxs/testFns.sh
new file mode 100755
index 0000000..3ce95cd
--- /dev/null
+++ b/core/auxs/testFns.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+
+# unit testing (integrity tests) for the pipeline
+#
+# script containing a series of integrity checks: internal and external dependencies of the pipeline
+
+# determine where the scripts are located
+scriptsDIR=`dirname $0`
+
+# load auxiliary fns
+. $scriptsDIR/auxFns.sh --source-only
+
+
+main() {
+    # integrity checks
+    echo "*** CHECKING RACS pipeline INTERNAL INTEGRITY..."
+    for i in `cat $scriptsDIR/../test/lst`; do
+        #echo $i;
+        checkIntegrityPipeline ../$i;
+    done
+
+    # external dependencies
+    echo "*** CHECKING EXTERNAL DEPENDENCIES with auxiliary software: BWA, SAMTOOLS & RSCRIPT..."
+    # checking requirements for the ORF part of the RACS pipeline
+    checkTools bwa samtools
+    # checking requirements for the intergenic part of the RACS pipeline
+    checkTools samtools Rscript
+}
+
+
+if [ "${1}" != "--source-only" ]; then
+    main "${@}"
+fi
+
+#########
diff --git a/core/comb_tables.sh b/core/comb_tables.sh
new file mode 100755
index 0000000..65984cd
--- /dev/null
+++ b/core/comb_tables.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+###-xv
+
+# comb_tables.sh --- RACS ORF pipeline
+#
+# auxiliary internal script to the RACS ORF pipeline, called from countReads.sh
+#
+# requires 3 arguments:
+#   1) INPUTfile (eg. BD1_INPUT)
+#   2) IPfile (eg. BD1_IP)
+#   3) T_THERM file (eg. T_thermophila_June2014.gff3)
+INPUTfile=`basename $1 .fastq.gz`
+IPfile=`basename $2 .fastq.gz`
+TTHERMfile=$3
+
+tableINPUTs=tableReadsINPUT.$INPUTfile
+
+tableIPs=tableReadsIP.$IPfile
+
+
+bigTABLE=TABLEE
+#normalizedTABLE=normalized.table.$IPfile-$INPUTfile
+#finalTABLE=FINAL.table.$IPfile-$INPUTfile
+IxFILES=$INPUTfile-$IPfile
+normalizedTABLE=normalized.table.$IxFILES
+finalTABLE=FINAL.table.$IxFILES
+
+# combine the tables generated by "table.sh" and "countReads.sh"
+paste table.$TTHERMfile $tableINPUTs $tableIPs > $bigTABLE
+
+# compute normalized quantities
+MIN=$(sort -n -k 4 $bigTABLE | head -1 | awk '{print $4}')
+
+# find how many genes have the MIN gene size
+MINgenes=$(awk -v min="$MIN" '{ if($4==min) print $0}' $bigTABLE | wc -l)
+echo "Minimum gene size found: "$MIN
+echo "Number of genes with MIN gene size "$MINgenes
+
+# $5: nbr reads for INPUT, $6: nbr of reads for IP; $4: gene size
+##awk -v min="$MIN" '{print $5*min/$4" "$6*min/$4}' $bigTABLE > $normalizedTABLE
+##awk '{print $5*150./$4"\t"$6*150./$4}' TABLEE > normalizedTABLE
+#paste $bigTABLE $normalizedTABLE > $FINALtable
+### this calculation was doing a previous normalization using the MIN.gene.size
+### which is obsolete and NOT needed anymore...
+### awk -v min="$MIN" '{print $0 $5*min/$4"\t"$6*min/$4}' $bigTABLE
+
+
+#awk '{if ($7==0) print "--"; else print $8/$7}' FINALtable > scores
+#paste $TABLEE $normalizedTABLE | awk '{if ($7==0) print "--"; else print $8/$7}' > $scores
+#paste FINALtable scores | sort -k 9 -n > SUPERfinal
+#paste $TABLEE $normalizedTABLE | awk '{if ($7==0) print "--"; else print $8/$7}' | sort -k 9 -n
+
+### this calculation was doing a previous normalization using the MIN.gene.size
+### which is obsolete and NOT needed anymore...
+### awk -v min="$MIN" '{print $0"\t"$5*min/$4"\t"$6*min/$4}' $bigTABLE | awk '{if ($7==0) print $0"\t""--"; else print $0"\t"$8/$7}' > $finalTABLE--SORTED
+# in principle, there is only one final table, ordered by scaffold loc.
+#cp $bigTABLE $finalTABLE--SORTED
+
+### awk -v min="$MIN" '{print $0"\t"$5*min/$4"\t"$6*min/$4}' $bigTABLE | awk '{if ($7==0) print $0"\t""--"; else print $0"\t"$8/$7}' | sort -k 9 -n > $finalTABLE
+mv $bigTABLE $finalTABLE
+
+### ADD headers to tables...
+#header="location \t name \t description \t GeneSize \t INPUT.reads \t IP.reads \t norm.INPUTs \t norm.IPs \t IP-INPUT.ratio"
+header="location \t name \t description \t GeneSize \t ${1}_reads \t ${2}_reads"
+#for table in {$finalTABLE--SORTED,$finalTABLE}; do
+for table in $finalTABLE; do
+    echo -e $header | cat - $table > tmp && mv tmp $table
+done
+
+
+####
+# for generating the table to use with the intergenic regions script:
+#paste FINAL.table.BD2_IP_NGS-BD2_INPUT_NGS--UNSORTED /scratch2//s/scinet/mponce/Tetrahymena_Ryerson/1stAttempt/FINAL.table.BD1_IP_NGS-BD1_INPUT_NGS--UNSORTED | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"(($9+$18)*.5) }' > combinedTABLES_BD1-BD2
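+
+# Example (sketch): although comb_tables.sh is normally invoked by countReads.sh,
+# it can be run directly; the file names below are placeholders:
+#    PATHtoRACSrepo/core/comb_tables.sh INPUT.fastq.gz IP.fastq.gz T_thermophila_June2014.gff3
+# This expects 'table.T_thermophila_June2014.gff3', 'tableReadsINPUT.INPUT' and
+# 'tableReadsIP.IP' to already exist in the current directory.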
diff --git a/core/countReads-wFns.sh b/core/countReads-wFns.sh
new file mode 100755
index 0000000..b32cad1
--- /dev/null
+++ b/core/countReads-wFns.sh
@@ -0,0 +1,254 @@
+#!/bin/bash
+# -vx
+
+# main script used to analyze and call (count) peaks in the T. thermophila data.
+# This script also calls 2 other scripts: "table.sh" and "comb_tables.sh"
+# The script requires BWA and samtools to be installed, as they are used within the pipeline.
+
+#################################
+### how to use it
+#
+# 1st argument: file with INPUT reads
+# 2nd argument: file with IP reads
+# 3rd argument: reference genome file (fasta)
+# 4th argument: annotation file (gff3)
+# 5th argument: working space (if possible use RAMdisk --ie. /dev/shm/--, or /tmp in a computer with SSD)
+# 6th argument (optional): number of cores to use for BWA multi-threading
+#
+# Examples:
+#   time PATHtoRACSrepo/core/countReads-wFns.sh _1_MED1_INPUT_S25_L007_R1_001.fastq.gz _3_MED1_IP_S27_L007_R1_001.fastq.gz T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /tmp/ 16
+#   time PATHtoRACSrepo/core/countReads-wFns.sh _1_MED1_INPUT_S25_L007_R1_001.fastq.gz _3_MED1_IP_S27_L007_R1_001.fastq.gz T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /dev/shm/ 16
+#
+#################################
+
+
+### functions #########################################
+LOChelpMsg(){
+    echo "How to use this script:"
+    exit 1;
+}
+#
+#
+LOCcheckTools(){
+    BWA=$(which bwa);
+    [ -z $BWA ] && errMsg "BWA needs to be installed for using this pipeline!"
+    SAMTOOLS=$(which samtools)
+    [ -z $SAMTOOLS ] && errMsg "SAMTOOLS needs to be installed in the system, as it is required by the pipeline!"
+}
+
+#####
+
+LOCcheckIntegrityPipeline(){
+    scriptsDIR=`dirname $0`
+    [ ! -x $scriptsDIR/table.sh ] && errMsg "missing 'table.sh' from pipeline directory $scriptsDIR"
+    [ ! -x $scriptsDIR/comb_tables.sh ] && errMsg "missing 'comb_tables.sh' from pipeline directory $scriptsDIR"
+}
+
+#
+#######################################################
+# setting preamble, detecting the scripts' location
+scriptsDIR=`dirname $0`
+
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/auxs/auxFns.sh ]]; then
+    . $scriptsDIR/auxs/auxFns.sh --source-only;
+else
+    echo "Error auxiliary file: " $scriptsDIR/auxs/auxFns.sh "NOT found!"
+    exit
+fi
+#######################################################
+
+#### CHECKS ###########################################
+welcome
+
+### CHECK arguments
+if [[ $# -eq 0 ]]; then
+    errMsg "No arguments supplied!";
+fi
+#
+if [[ $# -lt 5 ]]; then
+    errMsg "Five mandatory arguments are needed!";
+fi
+
+### CHECK software
+checkIntegrityPipeline table.sh comb_tables.sh
+checkTools bwa samtools
+######################################################
+
+# ====================================================
+# DEFINE SOME env.VARIABLES
+#dataDIR=DATA
+#dataDIR=data2
+#[ -d $1 ] && dataDIR=$1 || errMsg "1st argument should be a directory, where the reference data to be used is located"
+
+## reference file
+#fastaFILE=$dataDIR/T_thermophila_June2014_assembly.fasta    # <==== this could be another argument!!!
+#
+#faiFILE=$fastaFILE.fai
+
+## target files
+## TO BE READ as arguments... $1 (INPUT) and $2 (IP)
+[ -s $1 ] && INPUTfile=`basename $1` || errMsg "1st argument should be a file with the INPUT reads"
+[ -s $2 ] && IPfile=`basename $2` || errMsg "2nd argument should be a file with the IP reads"
+[ -s $3 ] && FASTAfile=`basename $3` || errMsg "3rd argument should be a file with the reference genome (fasta)"
+[ -s $4 ] && REFfile=`basename $4` || errMsg "4th argument should be an annotation file (gff3)"
+
+faiFILE=$FASTAfile
+fastaFILE=$FASTAfile
+
+inputFILE1=$INPUTfile
+inputFILE2=$IPfile
+
+# could be RAMdisk or TEMP on an SSD
+# use RAMDISK
+#RAMdisk=/dev/shm/$USER/
+#RAMdisk=/tmp/$USER/
+RunTimeDir=ORF_RACS-$$-`date '+%Y%m%d-%H%M%S'`
+[ -d $5 ] && WORKINGdir=$5/$USER/$RunTimeDir || errMsg "5th argument must be a directory, we suggest using /dev/shm for RAMdisk or /tmp in an SSD device, as this pipeline is quite I/O intense!"
+echo "Working directory --> " $WORKINGdir
+
+
+# number of threads to use with BWA
+cores=$(grep -P '^core id\t' /proc/cpuinfo | wc -l)
+#[ -z $6 ] && NT=$6 || NT=$cores
+if [ -n "$6" ]; then NT=$6 ; else NT=$cores; fi
+echo "BWA will use nbrCores="$NT", when possible"
+
+## =====
+
+#inputFILE1=$INPUTfile    #.fastq.gz
+#[ -s $inputFILE1 ] || errMsg $inputFILE1" not found"
+#inputFILE2=$IPfile    #.fastq.gz
+#[ -s $inputFILE2 ] || errMsg $inputFILE2" not found"
+
+# save the original directory
+myDIR=$(pwd)
+resultsDIR=$myDIR/ORF_RACS_results-`date '+%Y%m%d-%H%M%S'`    #`date +%D-%T`
+scriptsDIR=`dirname $0`
+
+
+# use RAMDISK
+#RAMdisk=/dev/shm/$USER/
+#RAMdisk=/tmp/$USER/
+
+echo    # create local space on RAMDISK
+mkdir -p $WORKINGdir
+# copy data into WORKINGdir
+#cp -pr ./DATA $RAMdisk
+cp -prv $1 $WORKINGdir/$inputFILE1
+cp -prv $2 $WORKINGdir/$inputFILE2
+cp -prv $3 $WORKINGdir/$FASTAfile
+cp -prv $4 $WORKINGdir/$REFfile
+# move to WORKINGdir
+cd $WORKINGdir
+
+
+# outputs
+outputINPUTbam=aln$INPUTfile.bam
+outputINPUTbamSORTED=aln$INPUTfile-sorted.bam
+outputINPUTsam=aln$INPUTfile.sam
+#outputIPbam=alnBD1_IP.bam
+#outputIPbamSORTED=alnBD1_IP-sorted.bam
+#outputIPsam=alnBD1_IP.sam
+outputIPbam=aln$IPfile.bam
+outputIPbamSORTED=aln$IPfile-sorted.bam
+outputIPsam=aln$IPfile.sam
+#output3gzip=/dev/shm/$USER/alnBD1_INPUT.tar.gz
+
+# tables files
+tableINPUTs=tableReadsINPUT.`basename $INPUTfile .fastq.gz`
+tableIPs=tableReadsIP.`basename $IPfile .fastq.gz`
+
+####=====================================================
+
+
+## pipeline
+# index the assembly
+# generates 4 aux. files (in principle these could be removed! -- *.fasta.*)
+$BWA index $fastaFILE
+#rm *.fasta.*
+
+# align the INPUT to the assembly
+# bwa -> multithreaded... (up to 4)
+$BWA aln -t $NT $fastaFILE $inputFILE1 > $outputINPUTbam
+
+# convert to sequence coordinates for the INPUT
+$BWA samse $fastaFILE $outputINPUTbam $inputFILE1 > $outputINPUTsam
+
+# remove inputFILE1 in case of memory issues
+#rm -v $inputFILE1
+
+#######
+# zip file in RAMDISK
+#tar cvzf $output3gzip $output2sam
+######
+
+# align the IP file to the assembly
+# bwa -> multithreaded... (up to 4)
+$BWA aln -t $NT $fastaFILE $inputFILE2 > $outputIPbam
+
+# convert to sequence coordinates for the IP
+$BWA samse $fastaFILE $outputIPbam $inputFILE2 > $outputIPsam
+
+# remove inputFILE2 in case of oom
+#rm -v $inputFILE2
+
+
+# FAI index of the assembly
+$SAMTOOLS faidx $fastaFILE
+
+
+# align the assembly with the sam and bam files
+$SAMTOOLS import $faiFILE $outputINPUTsam $outputINPUTbam
+$SAMTOOLS import $faiFILE $outputIPsam $outputIPbam
+
+# sorting for IGV
+$SAMTOOLS sort $outputINPUTbam -o $outputINPUTbamSORTED
+$SAMTOOLS sort $outputIPbam -o $outputIPbamSORTED
+
+$SAMTOOLS index $outputINPUTbamSORTED
+$SAMTOOLS index $outputIPbamSORTED
+
+
+# define the table file for extracting genes... passed as third argument to the script!!!
+table=table.$REFfile
+# extract the reads
+touch $tableINPUTs    #tableReadsINPUT
+# read scaffolds from the table generated by the "table" script and loop over the sorted BAM files
+# NEEDS the file "table" generated by the script 'table.sh' // if it doesn't exist, launch its creation in parallel
+### [ ! -e $resultsDIR/$table ] || $scriptsDIR/table.sh $REFfile && cp -v $resultsDIR/$table .
+$scriptsDIR/table.sh $REFfile
+[ $? -ne 0 ] && errMsg "'$scriptsDIR/table.sh $REFfile' FAILED...! exitcode $?"
+
+for i in `awk '{print $1}' < $table `; do
+    #echo $i;
+    $SAMTOOLS view $outputINPUTbamSORTED $i | wc -l >> $tableINPUTs    #tableReadsINPUT
+done
+
+touch $tableIPs    #tableReadsIP
+for i in `awk '{print $1}' < $table `; do
+    #echo $i;
+    $SAMTOOLS view $outputIPbamSORTED $i | wc -l >> $tableIPs    #tableReadsIP
+done
+
+
+# remove temporary files...
+rm -v *.fasta.*
+
+### copy data into the original directory
+mkdir -p $resultsDIR
+cp $outputINPUTbamSORTED $outputIPbamSORTED $resultsDIR
+cp $outputINPUTsam $outputIPsam $resultsDIR
+cp $tableINPUTs $resultsDIR
+cp $tableIPs $resultsDIR
+# the index files are needed for the intergenic regions determination
+cp *bai $resultsDIR
+
+# FINAL step, combine the tables together...
+#. $scriptsDIR/comb_tables.sh _2_MED2_INPUT_S26_L007_R1_001 _4_MED2_IP_S28_L007_R1_001 T_thermophila_June2014.gff3
+$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile
+[ $? -ne 0 ] && errMsg "'$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile' FAILED...! exitcode $?"
+cp -v FINAL.table.* $resultsDIR
+
+# clean the temp dir used
+rm -rfv $WORKINGdir
diff --git a/core/countReads.sh b/core/countReads.sh
new file mode 100755
index 0000000..8a9b60d
--- /dev/null
+++ b/core/countReads.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+# -vx
+
+# RACS ORF pipeline
+# main script used to analyze and call (count) peaks in the T. thermophila data.
+# This script also calls 2 other scripts: "table.sh" and "comb_tables.sh"
+# The script requires BWA and samtools to be installed, as they are used within the pipeline.
+
+#################
+### how to use it
+#
+# 1st argument: file with INPUT reads (.fastq.gz)
+# 2nd argument: file with IP reads (.fastq.gz)
+# 3rd argument: reference genome file (.fasta)
+# 4th argument: annotation file (.gff3)
+# 5th argument: working space (if possible use RAMdisk --ie. /dev/shm/--, or /tmp in a computer with SSD)
+# 6th argument (optional): number of cores to use for BWA multi-threading
+#
+# Examples:
+#   time PATHtoRACSrepo/core/countReads.sh _1_MED1_INPUT_S25_L007_R1_001.fastq.gz _3_MED1_IP_S27_L007_R1_001.fastq.gz T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /tmp/ 16
+#   time PATHtoRACSrepo/core/countReads.sh _1_MED1_INPUT_S25_L007_R1_001.fastq.gz _3_MED1_IP_S27_L007_R1_001.fastq.gz T_thermophila_June2014_assembly.fasta T_thermophila_June2014.gff3 /dev/shm/ 16
+#
+#################################
+
+
+#######################################################
+# check that the script is not being sourced!!!
+if [[ $0 == '-bash' ]]; then
+    echo "Please do not 'source' this script; ie. run it as PATHtoRACS/core/SCRIPTname arguments"
+    return 1
+fi
+
+# setting preamble, detecting the scripts' location
+scriptsDIR=`dirname $0`
+
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/auxs/auxFns.sh ]]; then
+    . $scriptsDIR/auxs/auxFns.sh --source-only;
+else
+    echo "Error auxiliary file: " $scriptsDIR/auxs/auxFns.sh "NOT found!"
+    exit
+fi
+#######################################################
+
+#### CHECKS ###########################################
+welcome
+
+### CHECK arguments
+if [[ $# -eq 0 ]]; then
+    errMsg "No arguments supplied!";
+fi
+#
+if [[ $# -lt 5 ]]; then
+    errMsg "Five mandatory arguments are needed!";
+fi
+
+### CHECK software
+checkIntegrityPipeline table.sh comb_tables.sh
+checkTools bwa samtools
+######################################################
+# define generic names for the required programs...
+BWA=$(which bwa)
+SAMTOOLS=$(which samtools)
+
+# ====================================================
+# DEFINE SOME env.VARIABLES
+#dataDIR=DATA
+#dataDIR=data2
+#[ -d $1 ] && dataDIR=$1 || errMsg "1st argument should be a directory, where the reference data to be used is located"
+
+## reference file
+#fastaFILE=$dataDIR/T_thermophila_June2014_assembly.fasta    # <==== this could be another argument!!!
+#
+#faiFILE=$fastaFILE.fai
+
+## target files
+## TO BE READ as arguments... $1 (INPUT) and $2 (IP)
+[ -s $1 ] && INPUTfile=`basename $1` || errMsg "1st argument should be a file with the INPUT reads (fastq.gz)"
+[ -s $2 ] && IPfile=`basename $2` || errMsg "2nd argument should be a file with the IP reads (fastq.gz)"
+[ -s $3 ] && FASTAfile=`basename $3` || errMsg "3rd argument should be a file with the reference genome (fasta)"
+[ -s $4 ] && REFfile=`basename $4` || errMsg "4th argument should be an annotation file (gff3)"
+
+faiFILE=$FASTAfile
+fastaFILE=$FASTAfile
+
+inputFILE1=$INPUTfile
+inputFILE2=$IPfile
+
+# could be RAMdisk or TEMP on an SSD
+# use RAMDISK
+#RAMdisk=/dev/shm/$USER/
+#RAMdisk=/tmp/$USER/
+RunTimeDir=ORF_RACS-$$-`date '+%Y%m%d-%H%M%S'`
+[ -d $5 ] && WORKINGdir=$5/$USER/$RunTimeDir || errMsg "5th argument must be a directory, we suggest using /dev/shm for RAMdisk or /tmp in an SSD device, as this pipeline is quite I/O intense!"
+echo "Working directory --> " $WORKINGdir + + +# number of threads to use with BWA +#cores=$(grep -P '^core id\t' /proc/cpuinfo| wc -l) +cores=`detectCores` +#[ -z $6 ] && NT=$6 || NT=$cores +if [ " "$6 != " " ]; then NT=$6 ; else NT=$cores; fi +echo "BWA will use nbrCores="$NT", when possible" + +## ===== + +#inputFILE1=$INPUTfile #.fastq.gz +#[ -s $inputFILE1 ] || errMsg $inputFILE1" not found" +#inputFILE2=$IPfile #.fastq.gz +#[ -s $inputFILE2 ] || errMsg $inputFILE2" not found" + +# save original directory +myDIR=$(pwd) +resultsDIR=$myDIR/ORF_RACS_results-`date '+%Y%m%d-%H%M%S'` #`date +%D-%T` +scriptsDIR=`dirname $0` + + +# use RAMDISK +#RAMdisk=/dev/shm/$USER/ +#RAMdisk=/tmp/$USER/ + +echo # create local space on RAMDISK +mkdir -p $WORKINGdir +# copy data into WORKINGdir +#cp -pr ./DATA $RAMdisk +cp -prv $1 $WORKINGdir/$inputFILE1 +cp -prv $2 $WORKINGdir/$inputFILE2 +cp -prv $3 $WORKINGdir/$FASTAfile +cp -prv $4 $WORKINGdir/$REFfile +# move to WORKINGdir +cd $WORKINGdir + + +# outputs +outputINPUTbam=aln$INPUTfile.bam +outputINPUTbamSORTED=aln$INPUTfile-sorted.bam +outputINPUTsam=aln$INPUTfile.sam +#outputIPbam=alnBD1_IP.bam +#outputIPbamSORTED=alnBD1_IP-sorted.bam +#outputIPsam=alnBD1_IP.sam +outputIPbam=aln$IPfile.bam +outputIPbamSORTED=aln$IPfile-sorted.bam +outputIPsam=aln$IPfile.sam +#output3gzip=/dev/shm/$USER/alnBD1_INPUT.tar.gz + +# tables files +tableINPUTs=tableReadsINPUT.`basename $INPUTfile .fastq.gz` +tableIPs=tableReadsIP.`basename $IPfile .fastq.gz` + +####===================================================== + + +## pipeline +# index the assembly +# generates 4 aux. files (in principle could be removed! -- *.fasta.*) +$BWA index $fastaFILE +#rm *.fasta.* + +# align the INPUT to assembly +# bwa -> multithreaded... (upto 4) +$BWA aln -t $NT $fastaFILE $inputFILE1 > $outputINPUTbam + +# convert to sequence coordinate for INPUT +$BWA samse $fastaFILE $outputINPUTbam $inputFILE1 > $outputINPUTsam + +# remove inputFILE1 in case of memory issues +#rm -v $inputFILE1 + +####### +# zip file in RAMDISK +#tar cvzf $output3gzip $output2sam +###### + +# align the IP file to the assembly +# bwa -> multithreaded... (upto 4) +$BWA aln -t $NT $fastaFILE $inputFILE2 > $outputIPbam + +# convert to sequence coordinate for IP +$BWA samse $fastaFILE $outputIPbam $inputFILE2 > $outputIPsam + +# remove inputFILE2 in case of oom +#rm -v $inputFILE2 + + +# FAI index to the assembly +$SAMTOOLS faidx $fastaFILE + + +# align assembly witht the sam and bam files +$SAMTOOLS import $faiFILE $outputINPUTsam $outputINPUTbam +$SAMTOOLS import $faiFILE $outputIPsam $outputIPbam + +# sorting for IGV +$SAMTOOLS sort $outputINPUTbam -o $outputINPUTbamSORTED +$SAMTOOLS sort $outputIPbam -o $outputIPbamSORTED + +$SAMTOOLS index $outputINPUTbamSORTED +$SAMTOOLS index $outputIPbamSORTED + + +# define table file for extracting genes... passed as third argument to the script!!! +table=table.$REFfile +# extract the reads +touch $tableINPUTs #tableReadsINPUT +# read scaffolds from table generated by the "table" script and loop over the sorted BAM files +# NEEDS the file "table" generated by the script 'table.sh' // if it doesn't exist, launch its creation in parallel +### [ ! -e $resultsDIR/$table ] || $scriptsDIR/table.sh $REFfile && cp -v $resultsDIR/$table . +$scriptsDIR/table.sh $REFfile +[ $? -ne 0 ] && errMsg "' $scriptsDIR/table.sh $REFfile' FAILED...! exitcode $?" 
+
+for i in `awk '{print $1}' < $table `; do
+    #echo $i;
+    $SAMTOOLS view $outputINPUTbamSORTED $i | wc -l >> $tableINPUTs    #tableReadsINPUT
+done
+
+touch $tableIPs    #tableReadsIP
+for i in `awk '{print $1}' < $table `; do
+    #echo $i;
+    $SAMTOOLS view $outputIPbamSORTED $i | wc -l >> $tableIPs    #tableReadsIP
+done
+
+
+# remove temporary files...
+rm -v *.fasta.*
+
+### copy data into the original directory
+mkdir -p $resultsDIR
+cp $outputINPUTbamSORTED $outputIPbamSORTED $resultsDIR
+cp $outputINPUTsam $outputIPsam $resultsDIR
+cp $tableINPUTs $resultsDIR
+cp $tableIPs $resultsDIR
+# the index files are needed for the intergenic regions determination
+cp *bai $resultsDIR
+
+# FINAL step, combine the tables together...
+#. $scriptsDIR/comb_tables.sh _2_MED2_INPUT_S26_L007_R1_001 _4_MED2_IP_S28_L007_R1_001 T_thermophila_June2014.gff3
+$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile
+[ $? -ne 0 ] && errMsg "'$scriptsDIR/comb_tables.sh $INPUTfile $IPfile $REFfile' FAILED...! exitcode $?"
+cp -v FINAL.table.* $resultsDIR
+
+# clean the temp dir used
+rm -rfv $WORKINGdir
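+
+# End-to-end sketch (for orientation only; file names are placeholders):
+# the ORF stage above is typically followed by the IGR stage, e.g.
+#    PATHtoRACSrepo/core/countReads.sh INPUT.fastq.gz IP.fastq.gz assembly.fasta annotation.gff3 /tmp/
+#    cd ORF_RACS_results-YYYYMMDD-HHMMSS
+#    ls aln*fastq.gz-sorted.bam > samples.file
+#    PATHtoRACSrepo/core/intergenic/det-interGenes.sh FINAL.table.INPUT-IP annotation.sorted.gff3 interGENs.csv samples.file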
diff --git a/core/intergenic/det-interGenes.sh b/core/intergenic/det-interGenes.sh
new file mode 100755
index 0000000..1703f59
--- /dev/null
+++ b/core/intergenic/det-interGenes.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+# det-interGenes.sh --- RACS IGR pipeline
+# main script in charge of determining the intergenic regions
+# this part of the pipeline requires the table generated by the ORF reads/counts
+#
+#
+# HOW TO USE THIS SCRIPT:
+#   arg1: final combined table generated by the ORF part of the RACS pipeline
+#   arg2: reference genome file (gff3)
+#   arg3: name of the file where to save the InterGenic Regions
+#   arg4: text file containing the names of the sorted BAM output files from the ORF part of the pipeline; eg.
+#      alnGreenblatt_1_SSCH1-1_ChIP_S81_R1_001.fastq.gz-sorted.bam
+#      alnGreenblatt_1_SSCH1-1_ChInput_S82_R1_001.fastq.gz-sorted.bam
+#
+# The aln*.fastq.gz-sorted.bam files should have been generated with the ORF part of the RACS pipeline!
+#
+# eg.
+#   PATHtoRACSrepo/core/intergenic/det-interGenes.sh combinedTABLES_MED1-MED2 dataset/T_thermophila_June2014.sorted.gff3 interGENs_MED1-MED2.csv samples.file
+#
+
+#################################################
+
+# Setting preamble, detecting the scripts' location
+scriptsDIR=`dirname $0`
+
+#################################################
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/../auxs/auxFns.sh ]]; then
+    . $scriptsDIR/../auxs/auxFns.sh --source-only;
+else
+    echo "Error auxiliary file: " $scriptsDIR/../auxs/auxFns.sh "NOT found!"
+    exit
+fi
+
+# display RACS welcome/credit message
+welcome
+
+
+#### CHECKS #####################################
+### INTEGRITY CHECKs
+# check the RACS scripts for the IGR routines
+checkIntegrityPipeline det-interGenes.sh \
+    interGeneRegions.R utils_RACS-IGR.R \
+    interGenes.sh
+
+# check the external tools needed
+checkTools samtools Rscript
+
+################################################
+
+### CHECK arguments
+if [[ $# -eq 0 ]]; then
+    errMsg "No arguments were supplied!";
+fi
+#
+case $# in
+    4) echo $# "Arguments received:"; echo $@;;
+    *) usage ;;
+esac
+
+###########
+
+# getting the command line arguments
+# MANDATORY arguments
+combTABLE=$1         #combinedTABLES_MED1-MED2
+refFILE=$2           #dataset/T_thermophila_June2014.sorted.gff3
+interGENregions=$3   #interGENs_MED1-MED2.csv -- output from the Rscript
+SAMPLES=$4   # input.samples: text file containing the names of the sample files to process, eg.
+             #    alnGreenblatt_1_SSCH1-1_ChIP_S81_R1_001.fastq.gz-sorted.bam
+             #
+             #    alnFillingham_1_MED1_INPUT_S25_L007_R1_001
+             #    alnFillingham_2_MED2_INPUT_S26_L007_R1_001
+             #    alnFillingham_3_MED1_IP_S27_L007_R1_001
+             #    alnFillingham_4_MED2_IP_S28_L007_R1_001
+#
+
+# check the input arguments, ie. that the files indicated actually exist!
+checkFile $combTABLE $refFILE $SAMPLES
+
+
+combinedFinalIGenicTable=${interGENregions}_INPUTs-IPs`date '+%Y%m%d-%H%M%S'`.csv
+# ================================================
+
+# step #0: look into combTABLE and generate the list of intergenic regions
+t0=$(time Rscript $scriptsDIR/interGeneRegions.R $combTABLE $refFILE $interGENregions)
+
+Ttot=$((t0))
+files=$(cat $SAMPLES)
+echo "TARGETS: " $files
+for file in $files; do
+    echo 'processing sample file:' $file
+    # process each of the samples against the intergenic regions just generated in 'step #0'
+    t1=$(time $scriptsDIR/interGenes.sh $file-`date '+%Y%m%d-%H%M%S'` $file $interGENregions)
+    # Add the individual times...
+    #Ttot=$((Ttot+t1))
+done
+
+
+# combine the tables with the INPUT/IP reads
+#t5=$(time paste interGENs_MED1-MED2.csv interGENs-Fillingham_1_MED1_INPUT_S25_L007_R1_001 interGENs-Fillingham_3_MED1_IP_S27_L007_R1_001 interGENs-Fillingham_2_MED2_INPUT_S26_L007_R1_001 interGENs-Fillingham_4_MED2_IP_S28_L007_R1_001 > interGENs_MED1-MED2_INPUTs-IPs.csv )
+###>>>>>>
+t5=$(time paste $interGENregions `cat $SAMPLES | awk '{print "interGENs-"$1".csv"}'` > $combinedFinalIGenicTable)
+
+# statistics on the running times
+echo Total time: $((Ttot+t5))
+
+###########################################################################
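+
+# Expected outputs (a sketch; sample names below are placeholders): for a samples.file
+# listing two sorted BAM files, the loop above produces one 'interGENs-<sample>.csv'
+# per sample, and the final 'paste' combines them with the IGR table, e.g.:
+#    interGENs-alnSAMPLE_Input.fastq.gz-sorted.bam.csv
+#    interGENs-alnSAMPLE_ChIP.fastq.gz-sorted.bam.csv
+#    interGENs_MED1-MED2.csv_INPUTs-IPsYYYYMMDD-HHMMSS.csv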
diff --git a/core/intergenic/interGeneRegions.R b/core/intergenic/interGeneRegions.R
new file mode 100755
index 0000000..95cdff0
--- /dev/null
+++ b/core/intergenic/interGeneRegions.R
@@ -0,0 +1,176 @@
+#########################################################################
+# RACS: InterGenic Regions -- main driver R script
+#
+# requires the "utils_RACS-IGR.R" utilities file with the fn defns
+#
+# part of the det-interGenes.sh pipeline
+# used for determining the intergenic regions
+#
+#
+# HOW TO USE this script:
+#
+#   Rscript interGeneRegions.R inputFile refFILE.gff3 outFile
+#
+#
+########################################################################
+
+#
+# Function to establish the location of the script... hence the relative path to the utilities file
+# this function needs to be in the main driver script!
+#
+
+ALLargs <- function() {
+    cmdArgs <- commandArgs(trailingOnly = FALSE)
+    scrPATHkw <- "--file="
+    match <- grep(scrPATHkw, cmdArgs)
+    if (length(match) > 0) {
+        # Rscript
+        scriptLOC <- dirname(normalizePath(sub(scrPATHkw, "", cmdArgs[match])))
+    } else {
+        # 'source'd via R console
+        scriptLOC <- normalizePath(sys.frames()[[1]]$ofile)
+    }
+
+    cmdArgs <- commandArgs(trailingOnly = TRUE)
+    return(list(scriptLOC, cmdArgs))
+}
+
+
+
+allArgs <- ALLargs()
+
+print(allArgs)
+
+########################################################################
+########################################################################
+cat(" RACS v1.0 (2018/2019) -- InterGenic Regions Finder",'\n')
+cat("----------------------------------------------------",'\n')
+#######################################################################
+########################################################################
+# load the utilities file with the fn defns.
+
+utilitiesFile <- paste(allArgs[[1]],"/utils_RACS-IGR.R",sep='')
+print(utilitiesFile)
+#print(dirname(sys.frame(1)$ofile)
+#sourceDir <- getSrcDirectory(function(dummy) {dummy})
+#print(sourceDir)
+
+
+if (file.exists(utilitiesFile)) {
+    source(utilitiesFile)
+} else {
+    stop("Critical ERROR: '",utilitiesFile,"' NOT found!")
+}
+
+########################################################################
+
+################################
+
+
+# read the files...
+files2process <- CLArgs()
+
+inputDATA <- readDATA(files2process[1])
+
+
+refTable <- readRefTable(files2process[2])
+
+
+sortedDATA <- reshapeTable(inputDATA)
+
+nbrEntries <- length(sortedDATA$lscaffold)
+
+
+
+#for (i in c(1:nbrEntries)) {
+#    #a <- sortedDATA[i,][1]    #levels(sortedDATA[i,]$lscaffold)[i]
+#    a <- levels(sortedDATA$lscaffold)[sortedDATA$lscaffold][i]
+#    aa <- sortedDATA[i,]$lbregion
+#    aaa <- sortedDATA[i,]$leregion
+#    cat(a,aa,aaa,'\n')
+#}
+
+lstscfld <- c()
+lstregion1 <- c()
+lstregion2 <- c()
+lstSize <- c()
+#nbrEntries <- 20
+
+SCFLD <- ""
+for (i in c(1:nbrEntries)) {
+    scaffold <- levels(sortedDATA$lscaffold)[sortedDATA$lscaffold][i]
+    begReg <- sortedDATA[i,]$lbregion
+    endReg <- sortedDATA[i,]$leregion
+    cat(i, scaffold,begReg,endReg,'\n')
+    if (SCFLD == scaffold) { # same scaffold
+        #if ((lstscfld[length(lstscfld)]==scaffold)&(begReg<lstregion2[length(lstscfld)])) {
+        #if (begReg>lstregion1[try(length(lstscfld))]) {
+        #if (begReg>endReg) {
+        if (begReg > sortedDATA[i-1,]$leregion) {
+            beginRegion <- begReg
+            #dumpData(SCFLD,endRegion,beginRegion, lstscfld,lstregion1,lstregion2,lstSize, 1)
+            myScfld <- dumpData(SCFLD,endRegion,beginRegion, 1)
+            lstscfld <- c(lstscfld, myScfld[1])
+            lstSize <- c(lstSize, myScfld[4])
+            lstregion1 <- c(lstregion1,endRegion+1)
+            lstregion2 <- c(lstregion2,beginRegion-1)
+            endRegion <- endReg
+        } else { # we will skip this region...
+            print(">>>>>>>>>>>>>>>> SUSPICIOUS OVERLAPPING REGIONs!!! <<<<<<<<<<<<<<<<")
+            cat(scaffold,begReg,endReg,'\n')
+            cat(try(lstscfld[length(lstscfld)-1]),try(lstregion1[length(lstscfld)-1]),try(lstregion2[length(lstscfld)-1]),'\n')
+            #stop
+        }
+    } else { # change of scaffold
+        if (SCFLD != "") {
+            beginRegion <- begReg
+            #dumpData(SCFLD,endRegion,'xxx', lstscfld,lstregion1,lstregion2,lstSize, 1)
+            scfCAP <- scfldCAP(SCFLD,refTable)
+            if (endRegion < scfCAP) { # dealing with cases where the gene ends at the limit of the scaffold...
+                myScfld <- dumpData(SCFLD,endRegion,scfCAP+1, 1)
+                lstscfld <- c(lstscfld, myScfld[1])
+                lstSize <- c(lstSize, myScfld[4])
+                lstregion1 <- c(lstregion1,endRegion+1)
+                lstregion2 <- c(lstregion2,scfCAP)
+            } else {
+                print(">>>>>>>>>>>>>>>> SUSPICIOUS OVERLAPPING REGIONs!!! <<<<<<<<<<<<<<<<")
+                cat(scaffold,begReg,endReg,'\n')
+                cat(try(lstscfld[length(lstscfld)-1]),try(lstregion1[length(lstscfld)-1]),try(lstregion2[length(lstscfld)-1]),'\n')
+                #stop
+            }
+            #dumpData(scaffold,'0',beginRegion, lstscfld,lstregion1,lstregion2,lstSize, 1)
+            myScfld <- dumpData(scaffold,'0',beginRegion, 1)
+            lstscfld <- c(lstscfld, myScfld[1])
+            lstSize <- c(lstSize, myScfld[4])
+            lstregion1 <- c(lstregion1,0+1)
+            lstregion2 <- c(lstregion2,beginRegion-1)
+            endRegion <- endReg
+        } else { # first scaffold ever...
+            beginRegion <- begReg    #strsplit(region,'-')[[1]][1]
+            endRegion <- endReg      #strsplit(region,'-')[[1]][2]
+            #dumpData(scaffold,'0',beginRegion, lstscfld,lstregion1,lstregion2,lstSize, 1)
+            myScfld <- dumpData(scaffold,'0',beginRegion, 1)
+            lstscfld <- c(lstscfld, myScfld[1])
+            lstSize <- c(lstSize, myScfld[4])
+            lstregion1 <- c(lstregion1,0+1)
+            lstregion2 <- c(lstregion2,beginRegion-1)
+        }
+        SCFLD <- scaffold
+    }
+}
+# take care of the last case...
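+# (after the loop no further gene triggers the "change of scaffold" branch,
+# so the stretch between the last gene's end and the scaffold cap -- taken
+# from the gff3 'supercontig' entry -- still has to be emitted here)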
+#dumpData(scaffold,endRegion,'xxx', lstscfld,lstregion1,lstregion2,lstSize, 1)
+scfCAP <- scfldCAP(scaffold,refTable)
+myScfld <- dumpData(scaffold,endRegion,scfCAP+1, 1)
+lstscfld <- c(lstscfld, myScfld[1])
+lstSize <- c(lstSize, myScfld[4])
+lstregion1 <- c(lstregion1,endRegion+1)
+lstregion2 <- c(lstregion2,scfCAP)
+
+interGenes <- data.frame(lstscfld,lstregion1,lstregion2,lstSize)
+names(interGenes) <- c("scaffold",'beginning','end','size')
+saveDATA(interGenes,files2process[3])
+
+
+
+
diff --git a/core/intergenic/interGenes.sh b/core/intergenic/interGenes.sh
new file mode 100755
index 0000000..9edc799
--- /dev/null
+++ b/core/intergenic/interGenes.sh
@@ -0,0 +1,73 @@
+# interGenes.sh --- RACS pipeline
+#
+# shell script part of the RACS InterGenic Regions determination
+# this script is part of the IGR RACS tool, use "det-interGenes.sh" instead!
+# this script should receive 3 arguments
+#
+# arg1: a label to annotate the produced files
+# arg2: alnXXXX.fastq.gz-sorted.bam file produced when running the ORF RACS pipeline tool
+# arg3: table containing the IGR
+#
+# Eg. call the script with BDx_{INPUT/IP}_NSG interGENs.csv
+## . ../newSCRIPTS/interGenes.sh INPUTm1 alnFillingham_1_MED1_INPUT_S26_L007_R1_001.fastq.gz-sorted.bam interGENs_MED1-MED2.csv
+#
+##################################################
+# setting preamble, detecting scripts location
+scriptsDIR=`dirname $0`
+
+
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/../auxs/auxFns.sh ]]; then
+    . $scriptsDIR/../auxs/auxFns.sh --source-only;
+else
+    echo "Error: auxiliary file" $scriptsDIR/../auxs/auxFns.sh "NOT found!"
+    exit
+fi
+#################################################
+
+# display RACS welcome message
+welcome
+
+#### CHECKS
+### INTEGRITY CHECKs
+# check external tools needed
+checkTools samtools
+
+#################################################
+
+### CHECK arguments
+case $# in
+    3) echo $# "Arguments received:"; echo $@;;
+    *) errMsg "Invalid number of arguments!"; usage ;;
+esac
+
+#########
+
+HEADER=$1
+src=$2
+#bamSORTEDfiles=aln$src-sorted.bam
+bamSORTEDfiles=$2
+interGENregions=$3
+table_interGENs=interGENs-$src.csv    #--`date '+%Y%m%d-%H%M%S'`.csv
+
+# check arguments: verify that the input files exist...
+checkFile $interGENregions $bamSORTEDfiles
+
+# check whether there was a previous version of the IGR table, and if so rename it
+# [[ -s $table_interGENs ]] && mv -v $table_interGENs $table_interGENs-PREVIOUS_`date '+%Y%m%d-%H%M%S'`.csv
+
+# generate header
+echo $table_interGENs"::"$HEADER > $table_interGENs
+
+
+# count reads...
+for i in `awk '{ if (NR>1) print $1}' < $interGENregions `; do
+    #echo $i;
+    samtools view $bamSORTEDfiles $i | wc -l >> $table_interGENs
+done
+
+####################################################################
diff --git a/core/intergenic/utils_RACS-IGR.R b/core/intergenic/utils_RACS-IGR.R
new file mode 100755
index 0000000..bbaf91b
--- /dev/null
+++ b/core/intergenic/utils_RACS-IGR.R
@@ -0,0 +1,225 @@
+# RACS pipeline -- interGENIC regions
+# utilities file for the interGeneRegions.R script
+
+
+#########################################################################
+
+reshapeTable <- function(origDATA){
+
+    lscaffold <- c()
+    lbregion <- c()
+    leregion <- c()
+
+    # reshape the table to FIX order...
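+    # each geneSCFFLD entry encodes "scaffold:begin-end" (as assembled by the
+    # ORF part of the pipeline); eg. for a hypothetical entry "scf_X:1004-2314",
+    # strsplit on ':' yields "scf_X" and "1004-2314", and the range is then
+    # split again on '-' into its numeric begin/end coordinates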
+    for (i in origDATA$geneSCFFLD) {
+        lscaffold <- c(lscaffold, strsplit(i,':')[[1]][1])
+        region <- strsplit(i,':')[[1]][2]
+        lbregion <- c(lbregion, as.numeric(strsplit(region,'-')[[1]][1]))
+        leregion <- c(leregion, as.numeric(strsplit(region,'-')[[1]][2]))
+    }
+    tmpTable <- data.frame(lscaffold,lbregion,leregion)
+    sortedTABLE <- tmpTable[order(tmpTable$lscaffold,tmpTable$lbregion),]
+
+    return(sortedTABLE)
+}
+#######################################################################
+
+
+#########################################################################
+
+dumpData <- function(scaffold,region1,region2, flag) {
+    r1 <- as.numeric(region1)+1
+    if (region2 != 'xxx') {
+        r2 <- as.numeric(region2)-1
+        geneSz <- r2-r1+1
+    } else {
+        r2 <- region2
+        geneSz <- 'XXX'
+    }
+
+    strng0 <- paste(scaffold,':',r1,'-',r2,sep='')
+    strng1 <- paste(strng0,'\t',geneSz,sep='')
+    #lstscfld <<- c(lstscfld, strng0)
+    #lstreg1 <<- c(lstreg1, r1)
+    #lstreg2 <<- c(lstreg2, r2)
+    #lstsize <<- c(lstsize, geneSz)
+
+    if (flag==1) {
+        cat(strng1)
+        cat('\n')
+    }
+
+    return(c(strng0,r1,r2,geneSz))
+}
+
+########################################################################
+
+
+########################################################################
+########################################################################
+########################################################################
+
+# OLD intergenic regions generating function
+# (kept for reference only; note that dumpData() now takes a 'flag' argument)
+interGenesRegionGen_OLD <- function(srtedTABLE) {
+    SCFLD <- ""
+    for (i in srtedTABLE$lscaffold) {
+        scaffold <- i    #strsplit(i,':')[[1]][1]    #substr(i,1,12)
+        region <- strsplit(i,':')[[1]][2]
+        print(i)
+        if (SCFLD == scaffold) { # same scaffold
+            beginRegion <- strsplit(region,'-')[[1]][1]
+            #print(i)
+            cat(SCFLD,':',as.numeric(endRegion)+1,'-',as.numeric(beginRegion)-1,'\t',as.numeric(beginRegion)-as.numeric(endRegion))
+            cat('\n')
+            endRegion <- strsplit(region,'-')[[1]][2]
+        } else { # change of scaffold
+            if (SCFLD != '') {
+                #print(i)
+                beginRegion <- strsplit(region,'-')[[1]][1]
+                dumpData(SCFLD,endRegion,'xxx')
+                dumpData(scaffold,'0',beginRegion)
+                endRegion <- strsplit(region,'-')[[1]][2]
+            } else { # first scaffold ever...
+                beginRegion <- strsplit(region,'-')[[1]][1]
+                endRegion <- strsplit(region,'-')[[1]][2]
+                dumpData(scaffold,'0',beginRegion)
+            }
+            SCFLD <- scaffold
+        }
+    }
+    # take care of the last case...
+    dumpData(scaffold,endRegion,'xxx')
+}
+
+########################################################################
+########################################################################
+########################################################################
+
+
+########################################################################
+
+
+
+
+
+########################################################################
+
+# function to dump the newly generated data into a CSV file
+saveDATA <- function(data,fileName) {
+    write.table(data, file=fileName, sep='\t', row.names=FALSE, quote=FALSE)
+}
+
+########################################################################
+# Help/Description Fn
+errMsgFn <- function(...){
+
+    cat("RACS: intergenic region determination Rscript",'\n')
+    cat("----------------------------------------------------",'\n')
+    cat("Attention! This script requires 3 arguments!",'\n')
+    cat('\t'," i: input file where to read the combined tables from",'\n')
+    cat('\t'," ii: reference *gff3* genome file for the organism, eg. 'T_thermophila_June2014.sorted.gff3'", '\n')
+    cat('\t'," iii: name of the generated intergenic files, eg. 'interGENs.csv'", '\n')
+    cat("---------------------------------------------------",'\n\n')
+    cat(paste(...,sep=''),'\n')
+    stop()
+}
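+# errMsgFn() prints the usage summary above, echoes the caller's message and
+# then aborts via stop(); CLArgs() below relies on it for every argument problem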
+
+# Function to process the command line arguments
+CLArgs <- function(def.ref.file="DATA/T_thermophila_June2014.sorted.gff3",def.out.name='interGENs.csv') {
+
+    checkFile <- function(fileN) {
+        if (file.exists(fileN) == FALSE) {
+            errMsgFn("Error: '",fileN,"' NOT found!")
+        }
+    }
+
+    # read command-line arguments, expecting three filenames:
+    # i) to read, ii) ref gff3 file for the organism, iii) to save results
+
+    args <- commandArgs(trailingOnly=TRUE)
+
+    len.args <- length(args)
+    if (len.args < 1) errMsgFn("Error: this script requires 3 arguments!")
+
+    # process 1st argument
+    inputFile <- args[1]    # eg. "combinedTABLES_BD1-BD2--SORTED"
+    checkFile(inputFile)
+
+    if (len.args == 3) {
+        refFile <- args[2]    # eg. "DATA/T_thermophila_June2014.sorted.gff3"
+        checkFile(refFile)
+        outFile <- args[3]
+    } else if (len.args == 2) {
+        refFile <- args[2]
+        checkFile(refFile)
+        cat("Assuming default name for output file... '",def.out.name,"'",'\n')
+        outFile <- def.out.name
+    } else {
+        errMsgFn("Error: incorrect number of arguments!")
+    }
+
+    return(c(inputFile,refFile,outFile))
+}
+
+
+# Function for reading the input data from a file...
+readDATA <- function(filename) {
+
+    ## read DATA
+    #combinedDATA=read.csv("~/Downloads/combinedTABLES_BD1-BD2--SORTED",header=F,sep='\t')
+    #combinedDATA=read.csv("./TABLEE",header=F,sep='\t')
+    #inputDATA <- read.csv(filename, header=F, sep='\t')
+
+    # the latest version of RACS-ORF will generate the final version of the TABLES with headers
+    inputDATA <- read.csv(filename, header=TRUE, sep='')
+
+    # give names to the columns...
+    names(inputDATA)[1]<-"geneSCFFLD"
+    #names(combinedDATA)[3]<-"Note"
+    #names(combinedDATA)[4]<-"geneSize"
+    #names(inputDATA)[5]<-"BD1readsINPUT"
+    #names(inputDATA)[6]<-"BD1readsIP"
+    #names(combinedDATA)[7]<-"normBD1readsINPUT"
+    #names(combinedDATA)[8]<-"normBD1readsIP"
+    #names(combinedDATA)[9]<-"BD1score"
+    #names(inputDATA)[10]<-"BD2readsINPUT"
+    #names(inputDATA)[11]<-"BD2readsIP"
+    #names(combinedDATA)[12]<-"normBD2readsINPUT"
+    #names(combinedDATA)[13]<-"normBD2readsIP"
+    #names(combinedDATA)[14]<-"BD2score"
+    #names(combinedDATA)[15]<-"BD1BD2enr"
+
+    return(inputDATA)
+}
+#########################################################################
+readRefTable <- function(refFile) {
+
+    refTableOrig <- read.csv(refFile, header=FALSE, sep='\t')
+
+    refTable <- data.frame(refTableOrig[refTableOrig$V3=='supercontig',]$V1, refTableOrig[refTableOrig$V3=='supercontig',]$V5)
+    names(refTable) <- c("scaffold","supercontig")
+
+    return(refTable)
+}
+
+#########################################################################
+
+scfldCAP <- function(scfld,refTable) {
+    return(refTable[refTable$scaffold==scfld,]$supercontig)
+}
+
+#########################################################################
+
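+# eg. for a hypothetical scaffold name "scf_X", scfldCAP("scf_X",refTable)
+# returns its last coordinate, ie. the end (column V5) of the corresponding
+# 'supercontig' record extracted from the reference gff3 by readRefTable()
+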
+warningSCFLD <- function(scfld,begReg,endReg) {
+    print(">>>>>>>>>>>>>>>> SUSPICIOUS OVERLAPPING REGIONs!!! <<<<<<<<<<<<<<<<")
+    cat(scfld,begReg,endReg,'\n')
+    cat(try(lstscfld[length(lstscfld)-1]),try(lstregion1[length(lstscfld)-1]),try(lstregion2[length(lstscfld)-1]),'\n')
+    #stop
+}
+
+########################################################################
+
+
+
+
diff --git a/core/normalizeORF.sh b/core/normalizeORF.sh
new file mode 100755
index 0000000..fe9e622
--- /dev/null
+++ b/core/normalizeORF.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# normalizeORF.sh --- RACS pipeline
+#
+# Script utilized to normalize the reads from the ORF
+#
+#
+###
+### How to use this script:
+#
+# This script takes 3 mandatory arguments, plus an optional 4th one:
+#
+# - 1st argument: "FINAL.table.*" file from RACS' ORF pipeline
+# - 2nd argument: "PF-INPUT-value" PF value corresponding to the INPUT file
+# - 3rd argument: "PF-IP-value" PF value corresponding to the IP file
+# - 4th argument: 'A' or 'D' (OPTIONAL); when this 4th argument is specified, an additional table is created, ordered with respect to the IP/INPUT ratio, in "A"scending or "D"escending order
+#
+#
+# Example:
+# PATHtoRACS/core/normalizeORF.sh FINAL.table.XXXX 14694464 10148171
+# PATHtoRACS/core/normalizeORF.sh FINAL.table.XXXX 14694464 10148171 A
+#
+#
+#################################
+
+
+# setting preamble, detecting scripts location
+scriptsDIR=`dirname $0`
+
+# load auxiliary fns for integrity checks and message/error handling
+if [[ -f $scriptsDIR/auxs/auxFns.sh ]]; then
+    . $scriptsDIR/auxs/auxFns.sh --source-only;
+else
+    echo "Error: auxiliary file" $scriptsDIR/auxs/auxFns.sh "NOT found!"
+    exit
+fi
+
+#### CHECKS ###########################################
+welcome
+### CHECK arguments
+if [[ $# -eq 0 ]]; then
+    errMsg "No arguments supplied!";
+fi
+#
+if [[ $# -lt 3 ]]; then
+    errMsg "Three mandatory arguments are needed!";
+fi
+#######
+[ -s $1 ] && INPUTfile=`basename $1` || errMsg "1st argument should be the FINAL.table.* file coming from the RACS' ORF pipeline"
+#######
+
+# check the values for the PF Clusters scores
+checkNbr $2 "Second"
+checkNbr $3 "Third"
+
+# check whether the user wants to generate an additional ordered table
+if [[ $# -eq 4 ]]; then
+    checkOption $4 "Fourth (optional)";
+elif [[ $# -gt 4 ]]; then
+    errMsg "This script can accept only four arguments!"
+fi
+
+
+#######
+
+######################################################
+
+# define the name of the new table with normalized entries
+normTABLE=normalized--$1
+
+# process the file, adding normalizations...
+# the first awk skips the first line of the FINAL.table to avoid its header;
+# a second awk is used to avoid division-by-zero, populating the ratio with "--" instead
+awk -v pfINPUT="$2" -v pfIP="$3" 'NR > 1 {print $0"\t"$5/pfINPUT"\t"$6/pfIP}{next}' $1 | awk '{ if ($7==0) print $0"\t""--"; else print $0"\t"($8/$7)}' > $normTABLE
+
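+# worked example (PF values taken from the usage example above): with
+# PF-INPUT=14694464 and PF-IP=10148171, a row whose read counts are $5=1000
+# and $6=2500 gains 1000/14694464 and 2500/10148171 as columns 7 and 8,
+# and their quotient $8/$7 as column 9 (or "--" whenever column 7 is zero)
+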
+# let's sort the normalized table by the ratio...
+if [[ $4 == "A" ]]; then
+    echo "sorting normalized table in 'A'scending order...";
+    sort -k 9 $normTABLE > $normTABLE--SORTED;
+elif [[ $4 == "D" ]]; then
+    echo "sorting normalized table in 'D'escending order...";
+    sort -k 9 -r $normTABLE > $normTABLE--SORTED;
+fi
+
+# Add header to the new files
+header=`head -1 $1`" \t normalized.INPUT \t normalized.IP \t normalized.IP.INPUT.ratio"
+
+if [[ $4 == "A" || $4 == "D" ]]; then
+    TABLES=$normTABLE--SORTED" "$normTABLE;
+else
+    TABLES=$normTABLE;
+fi
+
+echo $TABLES
+for table in $TABLES; do
+    echo $table;
+    echo -e $header | cat - $table > tmp && mv tmp $table
+done
+
+
+#######################################################################################
diff --git a/core/table.sh b/core/table.sh
new file mode 100755
index 0000000..66fe719
--- /dev/null
+++ b/core/table.sh
@@ -0,0 +1,27 @@
+# table.sh file, part of the RACS ORF pipeline
+# this script is internal to the RACS ORF pipeline, and it is called from countReads.sh
+
+# command-line arguments
+FILE=$1
+#FILE=T_thermophila_June2014.gff3
+filter1=gene
+filter2="Name=TTHERM_"
+#filter3='"hypothetical protein"'
+
+TARGET=$(grep $filter1 $FILE | grep $filter2)
+
+#grep $filter1 $FILE | grep $filter2 | awk '{print $1":"$4"-"$5" "$9}'
+
+# grab scaffold and genes' range
+grep $filter1 $FILE | grep $filter2 | awk '{print $1" "$4"-"$5}' > tmp0
+# grab "TTHERM"
+grep $filter1 $FILE | grep $filter2 | awk 'BEGIN{FS="TTHERM"} {print $2}' | awk 'BEGIN{FS=";Note"} {print "TTHERM"$1}' > tmp1
+# grab 'Note' and replace 'spaces' with 'underscores (_)'
+grep $filter1 $FILE | grep $filter2 | awk 'BEGIN{FS="Note="} {print "Note="$2}' | sed 's/ /_/g' > tmp2
+# compute gene size
+grep $filter1 $FILE | grep $filter2 | awk '{print $5-$4+1}' > tmp3
+
+# put it all together to generate the table
+paste tmp0 tmp1 tmp2 tmp3 | sort -k 1 | sort -t: -n -k 2 | awk '{print $1":"$2"\t"$3"\t"$4"\t"$5"\t"$6}' > table.$FILE
+rm tmp0 tmp1 tmp2 tmp3
+
diff --git a/core/test/lst b/core/test/lst
new file mode 100644
index 0000000..511b51e
--- /dev/null
+++ b/core/test/lst
@@ -0,0 +1,8 @@
+countReads.sh
+table.sh
+comb_tables.sh
+intergenic/interGenes.sh
+intergenic/interGeneRegions.R
+intergenic/det-interGenes.sh
+auxs/testFns.sh
+auxs/auxFns.sh
diff --git a/datasets/IBD1_Intergenic.xlsx b/datasets/IBD1_Intergenic.xlsx
new file mode 100644
index 0000000..8508802
Binary files /dev/null and b/datasets/IBD1_Intergenic.xlsx differ
diff --git a/datasets/IBD1_ORF.xlsx b/datasets/IBD1_ORF.xlsx
new file mode 100644
index 0000000..75ea369
Binary files /dev/null and b/datasets/IBD1_ORF.xlsx differ
diff --git a/datasets/PostProcessing_Genic.xlsx b/datasets/PostProcessing_Genic.xlsx
new file mode 100644
index 0000000..56b58a0
Binary files /dev/null and b/datasets/PostProcessing_Genic.xlsx differ
diff --git a/datasets/PostProcessing_Intergenic.xlsx b/datasets/PostProcessing_Intergenic.xlsx
new file mode 100644
index 0000000..87cd531
Binary files /dev/null and b/datasets/PostProcessing_Intergenic.xlsx differ
diff --git a/hpc/IGR-jobs_parallel.gnu b/hpc/IGR-jobs_parallel.gnu
new file mode 100644
index 0000000..9f91788
--- /dev/null
+++ b/hpc/IGR-jobs_parallel.gnu
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+## example of a sequence of RACS IGR jobs using GNU-parallel
+
+#####
+errMsg() { msg "$*"; helpMsg; exit 111;}
+####
+
+# this script requires GNU-PARALLEL to work, so let's first
+# be sure that it is installed in the system...
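+# (on many HPC systems GNU parallel itself is provided as a module; the exact
+# module name is site-specific, so adjust the environment before running this)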
+GNUPARALLEL=$(which parallel)
+[ -z $GNUPARALLEL ] && errMsg "This script uses GNU-PARALLEL to launch in parallel a series of InterGenic Region processes" || echo "Using $GNUPARALLEL"
+
+# set location of RACS
+RACSloc=$SCRATCH"/REPO/core/"
+interGenic=$RACSloc"/intergenic/det-interGenes.sh"
+
+# select reference GFF3 file
+REFgff3=$SCRATCH"/negCtrl/refs/T_thermophila_June2014.gff3"
+
+# location of the ORF results used as input for the IGR script
+ORFdirs="ORF_RACS_results-"    #20181212-141459"    #"ORF_RACS_results-"
+
+
+# function to explore the directories with RACS ORF results
+processFiles() {
+    for i in $ORFdirs*; do
+        cd $i;    #pwd;    #echo FINAL.table.*1;
+        # generate the list of sample files per directory
+        ls -1 *fastq.gz-sorted.bam > sample.input
+        ### check that the actual alignment and final.table files are there...
+        alnFiles=$(cat sample.input | grep -o -P '(?<=aln).*(?=.fastq.gz-sorted.bam)' | sed 's/ /-/')
+        IGRtable="FINAL.table."$alnFiles
+        #echo "expected files to process..." $alnFiles $IGRtable
+        ###
+        execute=$(for IGRtable in FINAL.table*1; do \
+            echo $interGenic $IGRtable $REFgff3 tableIGR--$IGRtable sample.input ;
+        done)
+        echo "echo $i && cd $i && pwd && $execute && cd .. ";
+        cd ..
+    done
+}
+
+# specify the name of the file where to record the log of the jobs
+LOG=$USER-IGR_RACS-`date '+%Y%m%d-%H%M%S'`.log
+
+# number of tasks to run at the same time, ideally equal to
+# the number of processors (or twice that, if hyper-threading is available)
+cores=$(grep -P '^core id\t' /proc/cpuinfo| wc -l)
+PARALLEL_TASKS_PER_NODE=$cores
+
+# process all the jobs via GNU-PARALLEL
+processFiles | $GNUPARALLEL --joblog $LOG -j $PARALLEL_TASKS_PER_NODE
diff --git a/hpc/modules b/hpc/modules
new file mode 100644
index 0000000..601367c
--- /dev/null
+++ b/hpc/modules
@@ -0,0 +1,8 @@
+# Example of module loading script
+# for the SciNet/Niagara system -- using CCEnv and SAMTOOLS and BWA + required supporting packages nixpkgs and intel compiler
+# November 2018
+
+module load CCEnv
+module load nixpkgs/16.09 intel/2018.3 samtools/1.9 bwa/0.7.17
+
+module list
diff --git a/hpc/submission.pbs b/hpc/submission.pbs
new file mode 100644
index 0000000..b6ea980
--- /dev/null
+++ b/hpc/submission.pbs
@@ -0,0 +1,23 @@
+#!/bin/bash
+# MOAB/Torque submission script for SciNet GPC
+#
+#PBS -l nodes=1:ppn=16,walltime=10:00:00
+#PBS -N RACS_ORF_run -q largemem
+
+# load modules, ie. to use, eg. SamTools and BWA
+module load samtools/1.3.1 bwakit
+
+# DIRECTORY TO RUN - $PBS_O_WORKDIR is the directory the job was submitted from
+cd $PBS_O_WORKDIR
+
+# EXECUTION COMMAND
+# RACS repo located on $SCRATCH
+# data subdirectory containing data files: INPUT and IP files
+# refs subdirectory containing reference files: fasta and gff3 files
+# the pipeline will attempt to use RAMDisk (/dev/shm) and autodetect the maximum number of cores to use
+($SCRATCH/RACS/core/countReads.sh \
+    data/SSCH1-1_Input_S2_R1_001.fastq.gz data/SSCH1-1_ChIP_S1_R1_001.fastq.gz \
+    refs/T_thermophila_June2014_assembly.fasta refs/T_thermophila_June2014.gff3 /dev/shm \
+    >& output.log) &
+
+wait
diff --git a/hpc/submission.sbatch b/hpc/submission.sbatch
new file mode 100644
index 0000000..ea3baf0
--- /dev/null
+++ b/hpc/submission.sbatch
@@ -0,0 +1,31 @@
+#!/bin/bash -l
+##SBATCH -p debug
+#SBATCH -N 1
+#SBATCH --ntasks-per-node=40
+#SBATCH -t 1:00:00
+#SBATCH -J RACS_negative_control
+#SBATCH --mail-user=YOURemailHERE
+#SBATCH --mail-type=ALL
+
+
+# load needed modules // RACS repository located on $SCRATCH
+. $SCRATCH/RACS/hpc/modules
+echo $SLURM_SUBMIT_DIR
+cd $SLURM_SUBMIT_DIR
+
+
+date
+
+# RACS repo located on $SCRATCH
+# data subdirectory containing data files: INPUT and IP files
+# refs subdirectory containing reference files: fasta and gff3 files
+# the pipeline will attempt to use RAMDisk (/dev/shm) and autodetect the maximum number of cores to use
+($SCRATCH/RACS/core/countReads.sh \
+    data/SSCH1-1_Input_S2_R1_001.fastq.gz data/SSCH1-1_ChIP_S1_R1_001.fastq.gz \
+    refs/T_thermophila_June2014_assembly.fasta refs/T_thermophila_June2014.gff3 /dev/shm \
+    >& output.log; \
+echo $?; \
+date) &
+
+wait
diff --git a/tools/BD1_peaks.xlsx b/tools/BD1_peaks.xlsx
new file mode 120000
index 0000000..cd5a2a5
--- /dev/null
+++ b/tools/BD1_peaks.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/BD1_peaks.xlsx
\ No newline at end of file
diff --git a/tools/BD1_peaks_MACS2.xlsx b/tools/BD1_peaks_MACS2.xlsx
new file mode 120000
index 0000000..13acbb7
--- /dev/null
+++ b/tools/BD1_peaks_MACS2.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/BD1_peaks_MACS2.xlsx
\ No newline at end of file
diff --git a/tools/BD2_peaks.xlsx b/tools/BD2_peaks.xlsx
new file mode 120000
index 0000000..6c015a3
--- /dev/null
+++ b/tools/BD2_peaks.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/BD2_peaks.xlsx
\ No newline at end of file
diff --git a/tools/BD2_peaks_MACS2.xlsx b/tools/BD2_peaks_MACS2.xlsx
new file mode 120000
index 0000000..77e583b
--- /dev/null
+++ b/tools/BD2_peaks_MACS2.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/BD2_peaks_MACS2.xlsx
\ No newline at end of file
diff --git a/tools/Ibd1-1.csv b/tools/Ibd1-1.csv
new file mode 120000
index 0000000..6a80a15
--- /dev/null
+++ b/tools/Ibd1-1.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd1-1.csv
\ No newline at end of file
diff --git a/tools/Ibd1-2.csv b/tools/Ibd1-2.csv
new file mode 120000
index 0000000..f2de721
--- /dev/null
+++ b/tools/Ibd1-2.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd1-2.csv
\ No newline at end of file
diff --git a/tools/Ibd1.csv b/tools/Ibd1.csv
new file mode 120000
index 0000000..1c9c41f
--- /dev/null
+++ b/tools/Ibd1.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd1.csv
\ No newline at end of file
diff --git a/tools/Ibd1.xlsx b/tools/Ibd1.xlsx
new file mode 120000
index 0000000..0534352
--- /dev/null
+++ b/tools/Ibd1.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd1.xlsx
\ No newline at end of file
diff --git a/tools/Ibd2-1.csv b/tools/Ibd2-1.csv
new file mode 120000
index 0000000..0c9765e
--- /dev/null
+++ b/tools/Ibd2-1.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd2-1.csv
\ No newline at end of file
diff --git a/tools/Ibd2-2.csv b/tools/Ibd2-2.csv
new file mode 120000
index 0000000..cb4a5a7
--- /dev/null
+++ b/tools/Ibd2-2.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd2-2.csv
\ No newline at end of file
diff --git a/tools/Ibd2.csv b/tools/Ibd2.csv
new file mode 120000
index 0000000..6255f48
--- /dev/null
+++ b/tools/Ibd2.csv
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd2.csv
\ No newline at end of file
diff --git a/tools/Ibd2.xlsx b/tools/Ibd2.xlsx
new file mode 120000
index 0000000..e66b7ac
--- /dev/null
+++ b/tools/Ibd2.xlsx
@@ -0,0 +1 @@
+../../../../chipseq-pipeline/compare2macs/Ibd2.xlsx
\ No newline at end of file
diff --git a/tools/compare.R b/tools/compare.R
new file mode 100644
index 0000000..ccda1b3
--- /dev/null
+++ b/tools/compare.R
@@ -0,0 +1,318 @@
+#########################################################################
+# RACS: Comparison Tools
+#
+# RACS utilities for comparison with MACS2 results
+#
+# requires the "utils_RACS-compare.R" utilities file with fn defns
+#
+#
+# HOW TO USE this script:
+#
+# run it with Rscript, or source("compare.R") from an R session;
+# it loads the RACS and MACS2 result files referenced below and runs the comparisons
+#
+#
+########################################################################
+
+
+#########################################################################
+
+# function to load data from a spreadsheet format
+load.data <- function(filename,spix=1) {
+
+    # 1st check that the file exists
+    checkFile <- function(filename) {
+        while (!file.exists(filename)) {
+            cat(paste(filename,"not found!",'\n'))
+            cat("Please select a file")
+            filename <- file.choose()
+        }
+        return(filename)
+    }
+
+    # check that the XLSX package is available and load it
+    loadCheckPkg("xlsx")
+
+    cat(paste("Loading file",filename,'\n'))
+    filename <- checkFile(filename)
+
+    # reading data
+    sample <- read.xlsx(filename,spix)
+    return(sample)
+}
+
+
+# function to check whether a package is available in the system
+loadCheckPkg <- function(pckgs){
+    for (pckg in pckgs) {
+        # check whether the package is NOT loaded
+        if (! paste('package:',pckg,sep="") %in% search()) {
+            # check whether the package is available in the system
+            if (pckg %in% .packages(all.available = TRUE)) {
+                # load the package
+                cat("Loading library",pckg,"... \n")
+                library(pckg, character.only=TRUE)
+            } else {
+                msg <- paste("Package:",pckg, "not found! You need to install this package using",paste("install.packages('",pckg,"')",sep=""))
+                stop(msg)
+            }
+        }
+    }
+}
+
+#######
+
+
+### TEST CASES
+sample1 <- load.data("BD1_peaks.xlsx",1)
+sample2 <- load.data("BD1_peaks_MACS2.xlsx",1)
+
+
+# ACTUAL DATA
+sampleIBD1 <- read.csv("Ibd1-2.csv",sep='\t')
+sampleIBD2 <- read.csv("Ibd2-2.csv",sep='\t')
+
+macsIBD1 <- load.data("BD1_peaks_MACS2.xlsx",1)
+macsIBD2 <- load.data("BD2_peaks_MACS2.xlsx",1)
+
+## IBD1/2 cases
+ibdX <- function(sample,ref,filename="ibdX.pdf") {
+    ibdX <- comparison(sample,ref,FALSE)
+    pdf(file=filename)
+    vizDiffs(ibdX)
+    dev.off()
+}
+#####
+
+tests <- function() {
+    sample1 <- data.frame()
+    sample1 <- rbind(sample1, c(as.character("scfld_01"), as.numeric(1),as.numeric(101)) )
+
+}
+
+#######
+
+# MAIN functions of the comparison tool
+
+overlap <- function(x1,x2, y1,y2){
+    # NO overlap condition
+    if (x1 > y2) return(+1.0)
+    if (x2 < y1) return(-1.0)
+
+    ax = (x2 - x1)
+    ay = (y2 - y1)
+    #print(ax)
+    #print(ay)
+
+    #if ((x1<=y1) & (x2>=y2)) return(+ax/ay)
+    if (x1<=y1) return(+ax/ay)
+    #if ((x1>y1) & (x2<y2)) return(-ay/ax)
+    if (x1>y1) return(-ay/ax)
+    if ((x1==y1)&(x2==y2)) return(0)
+
+    return(NA)
+}
+
+######
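+# worked example of the overlap() convention:
+#   overlap(100,200, 300,400) -> -1   and   overlap(300,400, 100,200) -> +1,
+#     ie. a return value of +/-1 flags completely disjoint intervals;
+#   overlap(100,200, 150,400) -> +100/250 = +0.4, ie. for overlapping
+#     intervals the sign records which one starts first (positive when the
+#     first interval does) and the magnitude is the ratio of their sizes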
+
+comparison.ALL <- function(sample1,sample2) {
+
+    # identify the scaffolds
+    scflds_smpl1 <- unique(as.character(sample1$Region))
+    scflds_smpl2 <- unique(as.character(sample2$Region))
+
+    ### 1-to-1 comparison
+    results <- data.frame()
+    for (scfld in scflds_smpl1) {
+        #scfld <- scflds_smpl1[1]
+        print(scfld)
+        # define the subsets based on the scaffold to work with
+        sset1 <- sample1[sample1$Region == scfld,]
+        sset2 <- sample2[sample2$Region == scfld,]
+        # order the subsets per intervals
+        sset1 <- sset1[order(sset1$start),]
+        sset2 <- sset2[order(sset2$start),]
+
+        for (i1 in 1:dim(sset1)[1]){
+            for (i2 in 1:dim(sset2)[1]) {
+                ovrlap <- overlap(sset1$start[i1],sset1$end[i1], sset2$start[i2], sset2$end[i2])
+                if (is.numeric(ovrlap)) {
+                    cat(i1,i2,'\n')
+                    print(ovrlap)
+                    results <- rbind(results,c(scfld,sset1$start[i1],sset1$end[i1],sset2$start[i2], sset2$end[i2], ovrlap))
+                }
+            }
+        }
+    }
+    colnames(results) <- c("scfld","x1","x2","y1","y2","overlap")
+    results[abs(results$overlap) != 1,]
+    return(results)
+}
+
+########
+
+comparison <- function(sample1,sample2, DBG=TRUE) {
+
+    # sort the samples per scaffold
+    sample1 <- sample1[order(sample1$Region),]
+    sample2 <- sample2[order(sample2$Region),]
+
+    # identify the scaffolds
+    scflds_smpl1 <- unique(as.character(sample1$Region))
+    scflds_smpl2 <- unique(as.character(sample2$Region))
+    scflds <- c(scflds_smpl1,scflds_smpl2)
+    scflds <- unique(scflds[order(scflds)])
+
+    ### 1-to-region comparison
+    results <- c()
+    for (scfld in scflds_smpl1) {
+#   for (scfld in scflds) {
+        #scfld <- scflds_smpl1[1]
+        print(scfld)
+        # define the subsets based on the scaffold to work with
+        sset1 <- sample1[sample1$Region == scfld,]
+        sset2 <- sample2[sample2$Region == scfld,]
+        # order the subsets per intervals
+        sset1 <- sset1[order(sset1$start),]
+        sset2 <- sset2[order(sset2$start),]
+        dsset1 <- dim(sset1)[1]
+        dsset2 <- dim(sset2)[1]
+
+        if (dsset1 == 0) {
+            if (DBG) {
+                # no data registered for this scaffold in sample1
+                cat("no data registered for this scaffold:",scfld," in sample1",'\n')
+                print(sset1)
+                print(sset2)
+            }
+            tmpreg <- c(as.character(scfld),"--","--", sset2$start,sset2$end, as.numeric(-1))
+            if (DBG) print(tmpreg)
+            results <- rbind(results, as.character(tmpreg))
+        } else if (dsset2 == 0) {
+            if (DBG) {
+                cat("no data registered for this scaffold:",scfld," in sample2",'\n')
+                print(sset1)
+                print(sset2)
+            }
+            tmpreg <- c(as.character(scfld),"--","--", sset1$start,sset1$end, as.numeric(+1))
+            if (DBG) print(tmpreg)
+            results <- rbind(results, as.character(tmpreg))
+        } else {
+            for (i1 in 1:dsset1){
+                regResults <- c()
+                for (i2 in 1:dsset2) {
+                    ovrlap <- overlap(sset1$start[i1],sset1$end[i1], sset2$start[i2], sset2$end[i2])
+                    if (!is.na(ovrlap)) {
+                        cat(scfld,i1,i2,"---",sset1$start[i1],sset1$end[i1], sset2$start[i2], sset2$end[i2],'\n')
+                        print(ovrlap)
+                        newRecord <- c(as.character(scfld),as.numeric(sset1$start[i1]),as.numeric(sset1$end[i1]),as.numeric(sset2$start[i2]), as.numeric(sset2$end[i2]), as.numeric(ovrlap))
+                        if (DBG) print(paste("<<<<<<",newRecord))
+                        regResults <- rbind(regResults, as.character(newRecord))
+                        #regResults <- rbind(regResults, c(as.character(scfld),as.numeric(sset1$start[i1]),as.numeric(sset1$end[i1]),as.numeric(sset2$start[i2]), as.numeric(sset2$end[i2]), as.numeric(ovrlap)) )
+                    }
+                }
+                #regResults <- cbind(scfld,regResults)
+                colnames(regResults) <- c("scafold","x1","x2","y1","y2","overlap")
+                regResults <- as.data.frame(regResults)
+                filterCond <- abs(as.numeric(as.character(regResults[!is.na(regResults$overlap),]$overlap)))
+                if ( dim(regResults[filterCond == 1,])[1] == dsset2 ) {
+                    # no match at all
+                    results <- rbind(results, c(as.character(scfld),sset1$start[i1],sset1$end[i1], "--","--", as.numeric(as.character(regResults$overlap[1]))))
+                } else {
+                    ##results <- rbind(results, regResults[abs(regResults$overlap) != 1, ])
+                    #results <- rbind(results, regResults[filterCond != 1,])
+                    matchingRegs <- regResults[filterCond != 1,]
+                    if (DBG) { cat("######"); print(matchingRegs) ; print(dim(matchingRegs))}
+                    #####if (dim(matchingRegs)[1] > 1) stop()
+                    for (k in 1:dim(matchingRegs)[1]) {
+                        krecord <- matchingRegs[k,]
+                        newMatchingReg <- c(as.character(krecord$scafold),as.character(krecord$x1),as.character(krecord$x2),as.character(krecord$y1),as.character(krecord$y2),as.character(krecord$overlap))
+                        if (DBG) { cat("###### ::: k ... "); print(newMatchingReg) }
+                        #######results <- rbind(results, c(as.character(matchingRegs$scafold),as.character(matchingRegs$x1),as.character(matchingRegs$x2),as.character(matchingRegs$y1),as.character(matchingRegs$y2),as.character(matchingRegs$overlap)) )
+                        #results <- rbind(results, c(as.character(scfld),sset1$start[i1],sset1$end[i1],sset2$start[i2], sset2$end[i2],as.numeric(as.character(matchingRegs$overlap))) )
+                        results <- rbind(results,as.character(newMatchingReg))
+                    }
+                }
+            }
+        }
+    }
+    colnames(results) <- c("scafold","x1","x2","y1","y2","overlap")
+    rownames(results) <- c()
+    results <- as.data.frame(results)
+    results$overlap <- as.numeric(as.character(results$overlap))
+    results[abs(results$overlap) != 1,]
+
+    return(results)
+}
+
+######
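+# note on the 'overlap' column of the returned data.frame (encoding as above):
+#   +1 -> region present only in sample1 (no counterpart found in sample2)
+#   -1 -> region present only in sample2
+#   anything else -> overlapping regions, with the signed size ratio from overlap()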
+
+vizDiffs <- function(results, filename="", threshold=1000, iplots=FALSE) {
+    res <- results[!is.na(results$overlap),]
+    res <- res[res$overlap<threshold,]
+    yvar <- res$overlap
+
+    # determine max/min in plots
+    mmin <- round(min(yvar))
+    mmax <- round(max(yvar))
+    mm <- max(abs(mmin),abs(mmax))
+    yrange <- c(-mm,+mm)
+    xrange <- c(1, length(yvar))
+
+    # interactive plots
+    if (iplots) {
+        loadCheckPkg("plotly")
+        #loadCheckPkg("pandoc")
+        p1 <- plot_ly(x = c(results$scafold), y = results$overlap, type="bar")
+        p2 <- plot_ly(y = results$overlap, group=results$scafold, type="bar")
+        p3 <- plot_ly(x=~results$scafold, y = results$overlap, group=results$scafold, type="bar")
+
+        htmlwidgets::saveWidget(as.widget(p1), paste(filename,"-p1.html"), selfcontained=FALSE)
+        htmlwidgets::saveWidget(as.widget(p2), paste(filename,"-p2.html"), selfcontained=FALSE)
+        htmlwidgets::saveWidget(as.widget(p3), paste(filename,"-p3.html"), selfcontained=FALSE)
+    }
+
+    # static plots
+    if (filename!="") pdf(file=filename)
+    if (filename=="") dev.new()
+    plot(res$scafold,yvar)
+    abline(h = 0, v = 0, col = "gray60")
+
+    if (filename=="") dev.new()
+    #par(mfrow=c(2,1))
+    barplot(res$overlap, ylim=yrange)
+    box()
+    par(new=T)
+    plot(yvar, cex=0.5, ylim=yrange, ann=F, axes=F)
+    abline(h = 0, v = 0, col = "gray60")
+
+    if (filename=="") dev.new()
+    par(mfrow=c(2,1))
+    plot(res$overlap, cex=0.5, xlim=xrange, ylim=yrange, type='b')
+    abline(h = 0, v = 0, col = "gray60")
+    #par(new=TRUE)
+    plot(which(res$overlap>0), res[res$overlap>0,]$overlap, cex=.5, col='blue', xlim=xrange, ylim=yrange)
+    par(new=TRUE)
+    plot(which(res$overlap<0), res[res$overlap<0,]$overlap, cex=.5, col='red', xlim=xrange, ylim=yrange)
+    abline(h = 0, v = 0, col = "gray60")
+
+
+    if (filename=="") dev.new()
+    par(mfrow=c(1,1))
+    g <- yvar
+    h <- hist(g, freq=TRUE, breaks = 75, xlim=c(-50,50), density = 40, col = "lightgray", xlab = "overlap", main = "RACS vs MACS")
+    xfit <- seq(min(g), max(g), length = length(g))
+    yfit <- dnorm(xfit, mean = mean(g), sd = sd(g))
+    yfit <- yfit * diff(h$mids[1:2]) * length(g)
+    lines(xfit, yfit, col = "black", lwd = 2)
+    text(0,max(yfit*1.10),paste("mean=",mean(g), " -- ", "sd=",sd(g)))
+
+    if (filename!="") dev.off()
+}
+
+######
diff --git a/tools/ibdX.pdf b/tools/ibdX.pdf
new file mode 100644
index 0000000..7f2a4d2
Binary files /dev/null and b/tools/ibdX.pdf differ