1. Quality Control
1.1 SRA to fastq
Software:
fastq-dump in SRAToolkit
Software download:
https://github.com/ncbi/sra-tools
Reference script:
out_dir=./your_sra_folder [ -d "$out_dir" ] || mkdir -p "$out_dir" for i in $(ls ./your_sra_folder/*.sra); do echo $i /absolute_path_of_fastq-dump "$i" \ --split-3 --gzip -O "$out_dir"; done
1.2 Fastp
Software:
fastp
Software download:
https://github.com/OpenGene/fastp
Reference script:
fastp=/absolute_path_of_fastp adapters=/absolute_path_of_S.cerevisiae_adapters_with_BGI.fa input_dir=./your_fastq_folder out_dir=./fastq_after_fastp sample_id=$(find $input_dir -maxdepth 1 -iname "*_1.fastq.gz" -exec basename {} _1.fastq.gz \; | sort | uniq) parallel --jobs 5 --header : --colsep ' ' \ ' out_sub_dir={out_dir}/{sample_id} [ -d $out_sub_dir ] || mkdir -p $out_sub_dir {fastp} \ -i {input_dir}/{sample_id}_1.fastq.gz \ -I {input_dir}/{sample_id}_2.fastq.gz \ -o $out_sub_dir/{sample_id}.clean.R1.fq.gz \ -O $out_sub_dir/{sample_id}.clean.R2.fq.gz \ --adapter_fasta {adapters} \ --thread 5 ' \ ::: sample_id $sample_id \ ::: input_dir $input_dir \ ::: out_dir $out_dir \ ::: adapters $adapters \ ::: fastp $fastp
1.3 Remove adapters
Software:
bbduk in bbmap
Software download:
https://sourceforge.net/projects/bbmap/
Reference script:
outbbduk=/absolute_path_of_bbmap/bbduk.sh adapters=/absolute_path_of_S.cerevisiae_adapters_with_BGI.fa input_dir=../your_fastq_after_fastp_folder out_dir=./fastq_after_rmv_adapters file_path=$(find $input_dir -maxdepth 2 -iname "*.fastp.html" ) parallel --jobs 8 --header : --colsep ',' \ ' sub_dir=$(dirname {file_path} | awk -F"/" "{print \$NF}") sub_input_dir={input_dir}/$sub_dir sub_out_dir={out_dir}/$sub_dir [ -d $sub_out_dir ] || mkdir -p $sub_out_dir file_name=$(basename -s .fastp.html {file_path}) {bbduk} \ in1=$sub_input_dir/"$file_name".clean.R1.fq.gz \ in2=$sub_input_dir/"$file_name".clean.R2.fq.gz \ out1=$sub_out_dir/"$file_name".clean.R1.fq.gz \ out2=$sub_out_dir/"$file_name".clean.R2.fq.gz \ ref={adapters} \ threads=8 ' \ ::: input_dir $input_dir \ ::: out_dir $out_dir \ ::: file_path $file_path \ ::: adapters $adapters \ ::: bbduk $bbduk
1.4 Remove phiX
Software:
bbduk in bbmap
Software download:
https://sourceforge.net/projects/bbmap/
Reference script:
bbduk=/absolute_path_of_bbmap/bbduk.sh phiX=/absolute_path_of_S.cerevisiae_phiX174.fasta input_dir=../your_fastq_after_rmv_adapters_folder out_dir=../fastq_after_filter file_path=$(find $input_dir -maxdepth 2 -iname "*.paired_refstats.txt" ) parallel --jobs 8 --header : --colsep ',' \ ' sub_dir=$(dirname {file_path} | awk -F"/" "{print \$NF}") sub_input_dir={input_dir}/$sub_dir sub_out_dir={out_dir}/$sub_dir [ -d $sub_out_dir ] || mkdir -p $sub_out_dir file_name=$(basename -s .paired_refstats.txt {file_path}) {bbduk} \ in1=$sub_input_dir/"$file_name".clean.R1.fq.gz \ in2=$sub_input_dir/"$file_name".clean.R2.fq.gz \ out1=$sub_out_dir/"$file_name".clean.R1.fq.gz \ out2=$sub_out_dir/"$file_name".clean.R2.fq.gz \ ref={phiX} \ threads=8 ' \ ::: input_dir $input_dir \ ::: out_dir $out_dir \ ::: file_path $file_path \ ::: phiX $phiX \ ::: bbduk $bbduk
2. Bam
Software:
bwa-mem2、samtools
Software download:
https://github.com/bwa-mem2/bwa-mem2
http://www.htslib.org/download/
Reference script:
bwa_mem2=/absolute_path_of_bwa-mem2 ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna input_dir=../your_fastq_after_filter_folder out_dir=./1_after_bam select_list=./your_strain_name_list.csv parallel --jobs 4 -a $select_list --header : --colsep ',' \ ' out_dir_sub={out_dir}/{SM} [ -d "$out_dir_sub" ] || mkdir -p "$out_dir_sub" out_Paired_file="$out_dir_sub"/{SM}_Paired.bam {bwa_mem2} mem -t 16 -R "@RG\tID:{ID}\tPL:illumina\tSM:{SM}" \ {ref_path} \ {input_dir}/{Name}/{Name}.clean.R1.fq.gz \ {input_dir}/{Name}/{Name}.clean.R2.fq.gz | \ samtools view -b - | \ samtools sort - | \ samtools markdup -r - "$out_Paired_file" samtools index "$out_Paired_file" out_Unpaired_file="$out_dir_sub"/{SM}_Unpaired.bam {bwa_mem2} mem -t 16 -R "@RG\tID:{ID}\tPL:illumina\tSM:{SM}" \ {ref_path} \ {input_dir}/{Name}/{Name}.unpaired.fq.gz | \ samtools view -b - | \ samtools sort - | \ samtools markdup -r - "$out_Unpaired_file" samtools index "$out_Unpaired_file" ' \ ::: input_dir $input_dir \ ::: out_dir $out_dir \ ::: ref_path $ref_path \ ::: bwa_mem2 $bwa_mem2
3. Variation detection
Software:
samtools、parallel、GATK(Genome Analysis Toolkit)
Software download:
http://www.htslib.org/download/
https://gatk.broadinstitute.org/hc/en-us
Ubuntu install parallel:sudo apt-get install parallel
3.1 HaplotypeCaller
Reference script:
input_dir=../your_1_after_bam_folder out_dir=./gvcf filename=$(find $input_dir -maxdepth 2 -iname "*_Paired.bam" | sort ) ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna parallel --jobs 10 --header : --colsep ',' \ ' filename_pre=$(basename -s .mk_dup.bam {filename}) out_file={out_dir}/"$filename_pre".g.vcf.gz gatk HaplotypeCaller \ -R {ref_path} \ -I {filename} \ -O $out_file \ -ERC GVCF ' \ ::: out_dir $out_dir \ ::: filename $filename \ ::: ref_path $ref_path
3.2 GenomicsDBImport
Reference script:
out_file=2_2_GenomicsDBImport.sh if [ -f $out_file ]; then rm $out_file; fi && touch $out_file echo "#!/bin/bash gatk GenomicsDBImport \\ --genomicsdb-workspace-path ./Genomics_Database \\ --batch-size 300 \\ --reader-threads 60 \\ --intervals GenomicsDBImport.intervals \\ --tmp-dir ./tmp_dir" >> $out_file ls /gvcf_folder_generated_by_HaplotypeCaller/gvcf/*vcf | sed "s:^:\t-V :" | sed "s:$: \\\:" >> $out_file sed -i '$s/ \\$//' $out_file chmod u+x $out_file
3.3 GenotypeGVCFs all site
3.3.1 Reference script:
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna variant_dir=../2_GenomicsDBImport/Genomics_Database out_dir=./3_1_Chr_Split intervals=./intervals_without_mtDNA.list parallel --jobs 7 -a $intervals --header : --colsep ',' \ ' gatk GenotypeGVCFs \ --reference {ref_path} \ --include-non-variant-sites \ --variant gendb://{variant_dir} \ --intervals {intervals} \ --output {out_dir}/3_1_{intervals}.vcf.gz ' \ ::: ref_path $ref_path \ ::: variant_dir $variant_dir \ ::: out_dir $out_dir
3.3.2 Reference script:
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna input=./your_3.3.1_VCF_path.list output_dir=./3_3_GenotypeGVCFs_all_site output_file=3_3_GenotypeGVCFs_all_site.vcf.gz gatk MergeVcfs \ --INPUT $input \ --OUTPUT "$output_dir/$output_file" \ --REFERENCE_SEQUENCE $ref_path
3.4 SelectVariants
Reference script:
input_file=../3_GenotypeGVCFs_all_site/3_3_GenotypeGVCFs_all_site/3_3_GenotypeGVCFs_all_site.vcf.gz out_dir=./4_SelectVariants type="SNP INDEL MIXED MNP SYMBOLIC" parallel --jobs 6 --header : --colsep ',' \ 'gatk SelectVariants \ -select-type {type} \ -V {input_file} \ -O {out_dir}/4_{type}.vcf.gz' \ ::: input_file $input_file \ ::: out_dir $out_dir \ ::: type $type
3.5 VariantFiltration Mark
3.5.1 Reference script:
input_file=../4_SelectVariants/4_SNP.vcf.gz out_dir=./5_1_Split_SNPS_by_Chrom_Original intervals=./intervals_without_mtDNA.list parallel --jobs 6 -a $intervals --header : --colsep ',' \ 'gatk SelectVariants \ --intervals {intervals} \ -V {input_file} \ -O {out_dir}/{intervals}_SNP.vcf.gz' \ ::: input_file $input_file \ ::: out_dir $out_dir
3.5.2 Reference script:
input_dir=./5_1_Split_SNPS_by_Chrom_Original file_path=$(find $input_dir -maxdepth 1 -iname "*.vcf.gz" ) out_dir=./5_2_Split_SNPS_by_Chrom_Mark parallel --jobs 6 --header : --colsep ' ' \ ' intervals=$(basename -s _SNP.vcf.gz {file_path}) gatk VariantFiltration \ -V {file_path} \ -filter "QD < 2.0" --filter-name "QD2" \ -filter "QUAL < 30.0" --filter-name "QUAL30" \ -filter "SOR > 3.0" --filter-name "SOR3" \ -filter "FS > 60.0" --filter-name "FS60" \ -filter "MQ < 40.0" --filter-name "MQ40" \ -filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \ -filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \ -O {out_dir}/"$intervals"_SNP_Mark.vcf.gz ' \ ::: file_path $file_path \ ::: out_dir $out_dir
3.5.3 Reference script:
input_dir=../4_SelectVariants/4_SelectVariants out_dir=./5_3_VariantFiltration_Mark # Indel hard-filtering gatk VariantFiltration \ -V $input_dir/4_INDEL.vcf.gz \ -filter "QD < 2.0" --filter-name "QD2" \ -filter "QUAL < 30.0" --filter-name "QUAL30" \ -filter "FS > 200.0" --filter-name "FS200" \ -filter "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20" \ -O $out_dir/6_3_INDEL_Mark_Flag.vcf.gz # Mixed hard-filtering gatk VariantFiltration \ -V $input_dir/4_MIXED.vcf.gz \ -filter "QD < 2.0" --filter-name "QD2" \ -filter "QUAL < 30.0" --filter-name "QUAL30" \ -filter "FS > 200.0" --filter-name "FS200" \ -filter "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20" \ -O $out_dir/6_3_MIXED_Mark_Flag.vcf.gz
3.6 Set filtered gt to nocall
3.6.1 Reference script:
input_dir=../5_VariantFiltration_Mark/5_2_Split_SNPS_by_Chrom_Mark file_path=$(find $input_dir -maxdepth 1 -iname "*.vcf.gz" ) out_dir=./6_1_Split_SNPS_by_Chrom [ -d $out_dir ] || mkdir -p $out_dir parallel --jobs 16 --header : --colsep ',' \ ' intervals=$(basename -s _SNP_Mark.vcf.gz {file_path}) gatk SelectVariants \ -V {file_path} \ --set-filtered-gt-to-nocall \ -O {out_dir}/"$intervals"_SNP_Filter_to_Nocall.vcf.gz ' \ ::: file_path $file_path \ ::: out_dir $out_dir
3.6.2 Reference script:
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna input=./your_3.6.1_VCF_path.list output_dir=./6_Filter_to_Nocall output_file=6_SNP_Filter_to_Nocall.vcf.gz out="$output_dir/$output_file" [ -d $output_dir ] || mkdir -p $output_dir gatk MergeVcfs \ --INPUT $input \ --OUTPUT $out \ --REFERENCE_SEQUENCE $ref_path
3.6.3 Reference script:
input_dir=../5_VariantFiltration_Mark/5_3_VariantFiltration_Mark out_dir=./6_Filter_to_Nocall [ -d $out_dir ] || mkdir -p $out_dir type="INDEL MIXED" parallel --jobs 3 --header : --colsep ',' \ 'gatk SelectVariants \ -V {input_dir}/5_3_{type}_Mark_Flag.vcf.gz \ --set-filtered-gt-to-nocall \ -O {out_dir}/6_{type}_Filter_to_Nocall.vcf.gz' \ ::: input_dir $input_dir \ ::: out_dir $out_dir \ ::: type $type mv $input_dir/5_3_NO_VARIATION_Mark_Flag.vcf.gz $out_dir/6_NO_VARIATION_Filter_to_Nocall.vcf.gz mv $input_dir/5_3_NO_VARIATION_Mark_Flag.vcf.gz.tbi $out_dir/6_NO_VARIATION_Filter_to_Nocall.vcf.gz.tbi
3.7 MergeVcfs After Filtering
Reference script:
input_dir=../6_Set_filtered_gt_to_nocall/6_Filter_to_Nocall out_dir=./MergeVcfs_After_Filtering [ -d $out_dir ] || mkdir -p $out_dir gatk MergeVcfs \ -I $input_dir/6_SNP_Filter_to_Nocall.vcf.gz \ -I $input_dir/6_INDEL_Filter_to_Nocall.vcf.gz \ -I $input_dir/6_MIXED_Filter_to_Nocall.vcf.gz \ -O $out_dir/7_Merge_Filter_to_Nocall.vcf.gz
3.8 Select only variants
3.8.1 Reference script:
input_file=../7_MergeVcfs_After_Filtering/MergeVcfs_After_Filtering/7_Merge_Filter_to_Nocall.vcf.gz out_dir=./8_1_Split_by_Chrom intervals=./intervals_without_mtDNA.list [ -d $out_dir ] || mkdir -p $out_dir parallel --jobs 16 -a $intervals --header : --colsep ',' \ 'gatk SelectVariants \ -V {input_file} \ --intervals {intervals} \ --exclude-non-variants true \ --exclude-filtered true \ -O {out_dir}/8_1_{intervals}.vcf.gz' \ ::: input_file $input_file \ ::: out_dir $out_dir
3.8.2 Reference script:
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna input=./your_3.8.1_VCF_path.list output_dir=./8_Only_variants output_file=8_Only_variants.vcf.gz out="$output_dir/$output_file" [ -d $output_dir ] || mkdir -p $output_dir gatk MergeVcfs \ --INPUT $input \ --OUTPUT $out \ --REFERENCE_SEQUENCE $ref_path
3.9 Select filtered Variants
Reference script:
input_file=../8_Select_only_variants/8_Only_variants/8_Only_variants.vcf.gz out_dir=./Select_filtered_Variants [ -d $out_dir ] || mkdir -p $out_dir type="SNP INDEL MIXED MNP SYMBOLIC" parallel --jobs 6 --header : --colsep ',' \ 'gatk SelectVariants \ -select-type {type} \ -V {input_file} \ -O {out_dir}/9_{type}.vcf.gz' \ ::: input_file $input_file \ ::: out_dir $out_dir \ ::: type $type