HPCs

Request resources for a development environment

#!/bin/bash

module load ufrc
module load slurm
srundev --account=epi --qos=epi-b --time=02:00:00
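
Once the interactive session starts, a shell is opened on a compute node; the allocation can be checked and software loaded directly at the prompt. A minimal sketch of such a session (the fasta filename is just an example):

squeue -u $USER                           # confirm the interactive job is running and on which node
module load mothur                        # load tools as needed
mothur "#summary.seqs(fasta=test.fasta)"  # run a quick command interactively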


Download files from an online sequence repository (vendor)

#!/bin/sh
#SBATCH -p hpg1-compute
#SBATCH --account=epi
#SBATCH --qos=epi
#SBATCH --job-name=fastspar
#SBATCH --mail-type=ALL
#SBATCH --mail-user=rp3448@ufl.edu
#SBATCH --ntasks=1 
#SBATCH --mem=2gb
#SBATCH --time=72:00:00
date;hostname;pwd

wget --user=ravin --password=????????? http://hwwgenotyping.ksu.edu/data/ravin/2017/FLB2017AM-GBS_001-3_job319/AAAA-FLB2017_1_1.fastq

## ????????? is a placeholder for the actual password
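
If several files need to be pulled from the vendor, the single wget call can be replaced by a loop over a list of URLs inside the same job script. A sketch, assuming a file urls.txt (hypothetical name) with one download URL per line:

# download every URL listed in urls.txt with the same credentials
while read -r url; do
    wget --user=ravin --password=????????? "${url}"
done < urls.txt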


Basic job script

#!/bin/sh
#SBATCH -p hpg1-compute
#SBATCH --account=epi
#SBATCH --qos=epi-b
#SBATCH --job-name=mothur_NCBI
#SBATCH --mail-type=ALL
#SBATCH --mail-user=rp3448@ufl.edu
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48 
#SBATCH --mem=64gb
#SBATCH --time=04:00:00
date;hostname;pwd

# load the required programs
module load mothur

mothur GetFastQs_Ravin_bac2014.txt
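
The script is submitted from the login node with sbatch and monitored with the usual SLURM commands; the script filename below is just an example:

sbatch mothur_basic_job.sh   # submit the job; SLURM prints the assigned job ID
squeue -u $USER              # check the state of your queued/running jobs
scancel <jobid>              # cancel the job if needed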


Example of an array job - running SparCC

#!/bin/sh
#SBATCH -p hpg1-compute
#SBATCH --account=epi
#SBATCH --qos=epi-b
#SBATCH --job-name=SparCC
#SBATCH --mail-type=ALL
#SBATCH --mail-user=rp3448@ufl.edu
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16 
#SBATCH --mem=8gb
#SBATCH --time=48:00:00
#SBATCH --output=logs/SparCC_%A_%a.log
#SBATCH --array=0-99
date;hostname;pwd

# load the required programs
module load slurm
module load ufrc
module load fastspar
module load parallel
module load sparcc

SparCC.py data/permutation_${SLURM_ARRAY_TASK_ID}.txt -i 20 --cor_file=data/cor_permutation_${SLURM_ARRAY_TASK_ID}.txt
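
Note that the logs/ directory named in --output must exist before submission, and %A/%a expand to the array job ID and the task index, so each of the 100 tasks writes its own log. A sketch of preparing and submitting the array (the script name is an example):

mkdir -p logs data                                    # output locations expected by the script
sbatch sparcc_array.sh                                # submits tasks 0-99 in one array job
sacct -j <array_jobid> --format=JobID,State,Elapsed   # review per-task status afterwards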


Create a file with a list of all the jobs; here GetFastQs_Ravin_bac2015.txt has the list of steps for each barcode. Suppose we want to run some bioinformatics step for 200 barcodes. One can let the jobs run one at a time, which takes a lot of time. A better approach is to use multiple compute nodes - effectively 200 workers for 200 barcodes - and run the jobs simultaneously. If one barcode takes about 15 minutes to process, then 200 barcodes run in parallel still take roughly those 15 minutes; run one by one, they would take 15 x 200 minutes. A minimal submission sketch follows.
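
A minimal sketch of that idea with a SLURM job array (the script name is an example); the optional %50 suffix caps how many of the 200 tasks run at the same time:

# one array task per barcode; at most 50 tasks run concurrently
sbatch --array=1-200%50 mothur_per_barcode.sh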


#  mothur_execute_preprocess_bac2014.sh
#!/bin/sh
#SBATCH -p hpg1-compute
#SBATCH --account=epi
#SBATCH --qos=epi-b
#SBATCH --job-name=mothur_NCBI
#SBATCH --mail-type=ALL
#SBATCH --mail-user=rp3448@ufl.edu
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48 
#SBATCH --mem=64gb
#SBATCH --time=04:00:00
date;hostname;pwd

# load the required programs
module load mothur

mothur mothur_preprocess_bac2014.txt

grep groups GetFastQs_Ravin_Bac2014.txt  | awk '{print $3}' | sed 's/groups=//' | sed 's:);::' > groups.txt
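
For example, applied to the get.groups() line shown further below, the pipeline keeps only the group name:

# 3rd field of the input line:   groups=CH.15.Endo.M.1);
# resulting line in groups.txt:  CH.15.Endo.M.1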


What is mothur_preprocess_bac2014.txt?

make.contigs(ffastq=Tomato15BacEndo_R1.fastq, rfastq=Tomato15BacEndo_R2.fastq,processors=48);
summary.seqs(fasta=Tomato15BacEndo_R1.trim.contigs.fasta, processors=48);
trim.seqs(fasta=Tomato15BacEndo_R1.trim.contigs.fasta,oligos=Tomato15BacEndoOligos.txt, pdiffs=1, bdiffs=1, flip=T, processors=48);
summary.seqs(fasta=Tomato15BacEndo_R1.trim.contigs.trim.fasta, processors=48);


What is GetFastQs_Ravin_Bac2014.txt?

get.groups(group=Tomato15Bac_R1.trim.contigs.groups, fasta=Tomato15Bac_R1.trim.contigs.trim.fasta, groups=CH.15.Endo.M.1);
list.seqs(fasta=Tomato15Bac_R1.trim.contigs.trim.pick.fasta);
get.seqs(fastq=Tomato15Bac_R1.fastq, accnos=Tomato15Bac_R1.trim.contigs.trim.pick.accnos);
system(cp Tomato15Bac_R1.pick.fastq CH.15.Endo.M.1_R1_001.fastq);
get.seqs(fastq=Tomato15Bac_R2.fastq, accnos=Tomato15Bac_R1.trim.contigs.trim.pick.accnos);
system(cp Tomato15Bac_R2.pick.fastq CH.15.Endo.M.1_R2_001.fastq);
.
.
.
.
.
# repeats for each barcode
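
Rather than writing those repeated blocks by hand, the batch file can be generated from groups.txt with a small loop. A sketch, reusing the commands above with the group name parameterized (the output filename is an example):

# append one block of mothur commands per group listed in groups.txt
while read -r g; do
cat >> GetFastQs_generated.txt <<EOF
get.groups(group=Tomato15Bac_R1.trim.contigs.groups, fasta=Tomato15Bac_R1.trim.contigs.trim.fasta, groups=${g});
list.seqs(fasta=Tomato15Bac_R1.trim.contigs.trim.pick.fasta);
get.seqs(fastq=Tomato15Bac_R1.fastq, accnos=Tomato15Bac_R1.trim.contigs.trim.pick.accnos);
system(cp Tomato15Bac_R1.pick.fastq ${g}_R1_001.fastq);
get.seqs(fastq=Tomato15Bac_R2.fastq, accnos=Tomato15Bac_R1.trim.contigs.trim.pick.accnos);
system(cp Tomato15Bac_R2.pick.fastq ${g}_R2_001.fastq);
EOF
done < groups.txt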

Array job where the input fasta file is the same for all tasks, but each task uses a different metafile.

#!/bin/sh
#SBATCH -p hpg1-compute
#SBATCH --account=epi
#SBATCH --qos=epi-b
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=rp3448@ufl.edu
#SBATCH --output=mothur_logs/mothur_%A-%a.log
#SBATCH --ntasks=1
#SBATCH --mem=2gb
#SBATCH --time=2:00:00
#SBATCH --array=1-200
date;hostname;pwd

# load the required programs
module load mothur

RUN=${SLURM_ARRAY_TASK_ID}
echo "RUN: '${RUN}'"
group=$(sed -n "${RUN}p" groups.txt)  ### groups.txt holds one group name per line; see above for how it was created.
echo "Processing group '${group}'"
rm -rf "${group}"                        
mkdir -p "${group}"
cd "${group}"

ln -s ../Tomato14BacEndo_R1.fastq .
ln -s ../Tomato14BacEndo_R2.fastq .
ln -s ../Tomato14BacEndo_R1.trim.contigs.trim.fasta .
ln -s  ../Tomato14BacEndo_R1.trim.contigs.groups .

cp ../mothur_group_endo.txt . 

sed -i "s:MOTHUR_GROUP:${group}:g" mothur_group_endo.txt

mothur mothur_group_endo.txt

cp "${group}"*.fastq ../demultiplex_fastq/

date

cd ..

rm -rf "${group}"
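
mothur_group_endo.txt is not shown here, but from the sed substitution above it is presumably the same per-group batch shown earlier, with the group name replaced by the MOTHUR_GROUP placeholder so each array task can substitute its own group. A sketch of what such a template could look like (an assumption, not the original file):

get.groups(group=Tomato14BacEndo_R1.trim.contigs.groups, fasta=Tomato14BacEndo_R1.trim.contigs.trim.fasta, groups=MOTHUR_GROUP);
list.seqs(fasta=Tomato14BacEndo_R1.trim.contigs.trim.pick.fasta);
get.seqs(fastq=Tomato14BacEndo_R1.fastq, accnos=Tomato14BacEndo_R1.trim.contigs.trim.pick.accnos);
system(cp Tomato14BacEndo_R1.pick.fastq MOTHUR_GROUP_R1_001.fastq);
get.seqs(fastq=Tomato14BacEndo_R2.fastq, accnos=Tomato14BacEndo_R1.trim.contigs.trim.pick.accnos);
system(cp Tomato14BacEndo_R2.pick.fastq MOTHUR_GROUP_R2_001.fastq);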

