====== myIllu_06_sga.sh ====== #!/bin/bash -x BASE=$(basename "$1") # delete any leading path BASE=${BASE%%.*} rm temp_1.fq temp_2.fq # # Parameters # # Program paths SGA_BIN=/usr/local/bin/sga BWA_BIN=/usr/local/apps/bwa-0.7.12/bwa SAMTOOLS_BIN=/usr/local/apps/samtools-1.2/samtools BAM2DE_BIN=/usr/local/apps/sga/src/bin/sga-bam2de.pl ASTAT_BIN=/usr/local/apps/sga/src/bin/sga-astat.py DISTANCE_EST=/usr/local/apps/abyss-1.9.0/DistanceEst/DistanceEst SEQTK=/usr/local/apps/seqtk/seqtk # The number of threads to use CPU=20 # Correction k-mer CORRECTION_K=41 # The minimum overlap to use when computing the graph. # The final assembly can be performed with this overlap or greater MIN_OVERLAP=85 # The overlap value to use for the final assembly ASSEMBLE_OVERLAP=111 # Branch trim length TRIM_LENGTH=400 # The minimum length of contigs to include in a scaffold MIN_CONTIG_LENGTH=200 # The minimum number of read pairs required to link two contigs MIN_PAIRS=10 # # Dependency checks # # Check the required programs are installed and executable prog_list="$SGA_BIN $BWA_BIN $SAMTOOLS_BIN $BAM2DE_BIN $DISTANCE_EST $ASTAT_BIN" for prog in $prog_list; do hash $prog 2>/dev/null || { echo "Error $prog not found. Please place $prog on your PATH or update the *_BIN variables in this script"; exit 1; } done # interleaved file should be separated for bwa mapping echo "Separating interleaved file $1 into two temporary paired files..." $SEQTK seq -1 $1 > temp_1.fq $SEQTK seq -2 $1 > temp_2.fq IN1=temp_1.fq IN2=temp_2.fq # Check the files are found file_list="$IN1 $IN2" for input in $file_list; do if [ ! -f $input ]; then echo "Error input file $input not found"; exit 1; fi done # # Preprocessing # # Preprocess the data to remove ambiguous basecalls # You can specify -m, --min-length=INT option here, which is most useful when used in # conjunction with --quality trim, Default: 40 $SGA_BIN preprocess -m 101 --pe-mode 2 -o ${BASE}.pp.fastq $1 # # Error Correction # # Build the index that will be used for error correction # As the error corrector does not require the reverse BWT, suppress # construction of the reversed index $SGA_BIN index -a ropebwt -t $CPU --no-reverse ${BASE}.pp.fastq # Perform k-mer based error correction. # The k-mer cutoff parameter is learned automatically. $SGA_BIN correct -k $CORRECTION_K --learn -t $CPU -o ${BASE}.ec.fastq ${BASE}.pp.fastq exit # # Primary (contig) assembly # # Index the corrected data. $SGA_BIN index -a ropebwt -t $CPU ${BASE}.ec.fastq # Remove exact-match duplicates and ${BASE}.with low-frequency k-mers $SGA_BIN filter -x 2 -t $CPU ${BASE}.ec.fastq # Compute the structure of the string graph $SGA_BIN overlap -m $MIN_OVERLAP -t $CPU ${BASE}.ec.filter.pass.fa # Perform the contig assembly $SGA_BIN assemble -m $ASSEMBLE_OVERLAP --min-branch-length $TRIM_LENGTH -o primary ${BASE}.ec.filter.pass.asqg.gz # # Scaffolding # PRIMARY_CONTIGS=primary-contigs.fa PRIMARY_GRAPH=primary-graph.asqg.gz # Align the reads to the contigs $BWA_BIN index $PRIMARY_CONTIGS $BWA_BIN aln -t $CPU $PRIMARY_CONTIGS $IN1 > $IN1.sai $BWA_BIN aln -t $CPU $PRIMARY_CONTIGS $IN2 > $IN2.sai $BWA_BIN sampe $PRIMARY_CONTIGS $IN1.sai $IN2.sai $IN1 $IN2 | $SAMTOOLS_BIN view -Sb - > libPE.bam # Convert the BAM file into a set of contig-contig distance estimates $BAM2DE_BIN -n $MIN_PAIRS -m $MIN_CONTIG_LENGTH --prefix libPE libPE.bam # Compute copy number estimates of the contigs $ASTAT_BIN -m $MIN_CONTIG_LENGTH libPE.bam > libPE.astat # Build the scaffolds $SGA_BIN scaffold -m $MIN_CONTIG_LENGTH -a libPE.astat -o scaffolds.scaf --pe libPE.de $PRIMARY_CONTIGS # Convert the scaffolds to FASTA format $SGA_BIN scaffold2fasta --use-overlap --write-unplaced -m $MIN_CONTIG_LENGTH -a $PRIMARY_GRAPH -o sga-scaffolds.fa scaffolds.scaf # Remove temporary paired files #rm temp_1.fq temp_2.fq temp_1.fq.sai temp_2.fq.sai