1. Quality Control
1.1 SRA to fastq
Software:
fastq-dump in SRAToolkit
Software download:
https://github.com/ncbi/sra-tools
Reference script:
out_dir=./your_sra_folder
[ -d "$out_dir" ] || mkdir -p "$out_dir"
for i in $(ls ./your_sra_folder/*.sra);
do
echo $i
/absolute_path_of_fastq-dump "$i" \
--split-3 --gzip -O "$out_dir";
done
1.2 Fastp
Software:
fastp
Software download:
https://github.com/OpenGene/fastp
Reference script:
fastp=/absolute_path_of_fastp
adapters=/absolute_path_of_S.cerevisiae_adapters_with_BGI.fa
input_dir=./your_fastq_folder
out_dir=./fastq_after_fastp
sample_id=$(find $input_dir -maxdepth 1 -iname "*_1.fastq.gz" -exec basename {} _1.fastq.gz \; | sort | uniq)
parallel --jobs 5 --header : --colsep ' ' \
'
out_sub_dir={out_dir}/{sample_id}
[ -d $out_sub_dir ] || mkdir -p $out_sub_dir
{fastp} \
-i {input_dir}/{sample_id}_1.fastq.gz \
-I {input_dir}/{sample_id}_2.fastq.gz \
-o $out_sub_dir/{sample_id}.clean.R1.fq.gz \
-O $out_sub_dir/{sample_id}.clean.R2.fq.gz \
--adapter_fasta {adapters} \
--thread 5
' \
::: sample_id $sample_id \
::: input_dir $input_dir \
::: out_dir $out_dir \
::: adapters $adapters \
::: fastp $fastp
1.3 Remove adapters
Software:
bbduk in bbmap
Software download:
https://sourceforge.net/projects/bbmap/
Reference script:
outbbduk=/absolute_path_of_bbmap/bbduk.sh
adapters=/absolute_path_of_S.cerevisiae_adapters_with_BGI.fa
input_dir=../your_fastq_after_fastp_folder
out_dir=./fastq_after_rmv_adapters
file_path=$(find $input_dir -maxdepth 2 -iname "*.fastp.html" )
parallel --jobs 8 --header : --colsep ',' \
'
sub_dir=$(dirname {file_path} | awk -F"/" "{print \$NF}")
sub_input_dir={input_dir}/$sub_dir
sub_out_dir={out_dir}/$sub_dir
[ -d $sub_out_dir ] || mkdir -p $sub_out_dir
file_name=$(basename -s .fastp.html {file_path})
{bbduk} \
in1=$sub_input_dir/"$file_name".clean.R1.fq.gz \
in2=$sub_input_dir/"$file_name".clean.R2.fq.gz \
out1=$sub_out_dir/"$file_name".clean.R1.fq.gz \
out2=$sub_out_dir/"$file_name".clean.R2.fq.gz \
ref={adapters} \
threads=8
' \
::: input_dir $input_dir \
::: out_dir $out_dir \
::: file_path $file_path \
::: adapters $adapters \
::: bbduk $bbduk
1.4 Remove phiX
Software:
bbduk in bbmap
Software download:
https://sourceforge.net/projects/bbmap/
Reference script:
bbduk=/absolute_path_of_bbmap/bbduk.sh
phiX=/absolute_path_of_S.cerevisiae_phiX174.fasta
input_dir=../your_fastq_after_rmv_adapters_folder
out_dir=../fastq_after_filter
file_path=$(find $input_dir -maxdepth 2 -iname "*.paired_refstats.txt" )
parallel --jobs 8 --header : --colsep ',' \
'
sub_dir=$(dirname {file_path} | awk -F"/" "{print \$NF}")
sub_input_dir={input_dir}/$sub_dir
sub_out_dir={out_dir}/$sub_dir
[ -d $sub_out_dir ] || mkdir -p $sub_out_dir
file_name=$(basename -s .paired_refstats.txt {file_path})
{bbduk} \
in1=$sub_input_dir/"$file_name".clean.R1.fq.gz \
in2=$sub_input_dir/"$file_name".clean.R2.fq.gz \
out1=$sub_out_dir/"$file_name".clean.R1.fq.gz \
out2=$sub_out_dir/"$file_name".clean.R2.fq.gz \
ref={phiX} \
threads=8
' \
::: input_dir $input_dir \
::: out_dir $out_dir \
::: file_path $file_path \
::: phiX $phiX \
::: bbduk $bbduk
2. Bam
Software:
bwa-mem2、samtools
Software download:
https://github.com/bwa-mem2/bwa-mem2
http://www.htslib.org/download/
Reference script:
bwa_mem2=/absolute_path_of_bwa-mem2
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
input_dir=../your_fastq_after_filter_folder
out_dir=./1_after_bam
select_list=./your_strain_name_list.csv
parallel --jobs 4 -a $select_list --header : --colsep ',' \
'
out_dir_sub={out_dir}/{SM}
[ -d "$out_dir_sub" ] || mkdir -p "$out_dir_sub"
out_Paired_file="$out_dir_sub"/{SM}_Paired.bam
{bwa_mem2} mem -t 16 -R "@RG\tID:{ID}\tPL:illumina\tSM:{SM}" \
{ref_path} \
{input_dir}/{Name}/{Name}.clean.R1.fq.gz \
{input_dir}/{Name}/{Name}.clean.R2.fq.gz | \
samtools view -b - | \
samtools sort - | \
samtools markdup -r - "$out_Paired_file"
samtools index "$out_Paired_file"
out_Unpaired_file="$out_dir_sub"/{SM}_Unpaired.bam
{bwa_mem2} mem -t 16 -R "@RG\tID:{ID}\tPL:illumina\tSM:{SM}" \
{ref_path} \
{input_dir}/{Name}/{Name}.unpaired.fq.gz | \
samtools view -b - | \
samtools sort - | \
samtools markdup -r - "$out_Unpaired_file"
samtools index "$out_Unpaired_file"
' \
::: input_dir $input_dir \
::: out_dir $out_dir \
::: ref_path $ref_path \
::: bwa_mem2 $bwa_mem2
3. Variation detection
Software:
samtools、parallel、GATK(Genome Analysis Toolkit)
Software download:
http://www.htslib.org/download/
https://gatk.broadinstitute.org/hc/en-us
Ubuntu install parallel:sudo apt-get install parallel
3.1 HaplotypeCaller
Reference script:
input_dir=../your_1_after_bam_folder
out_dir=./gvcf
filename=$(find $input_dir -maxdepth 2 -iname "*_Paired.bam" | sort )
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
parallel --jobs 10 --header : --colsep ',' \
'
filename_pre=$(basename -s .mk_dup.bam {filename})
out_file={out_dir}/"$filename_pre".g.vcf.gz
gatk HaplotypeCaller \
-R {ref_path} \
-I {filename} \
-O $out_file \
-ERC GVCF
' \
::: out_dir $out_dir \
::: filename $filename \
::: ref_path $ref_path
3.2 GenomicsDBImport
Reference script:
out_file=2_2_GenomicsDBImport.sh
if [ -f $out_file ]; then rm $out_file; fi && touch $out_file
echo "#!/bin/bash
gatk GenomicsDBImport \\
--genomicsdb-workspace-path ./Genomics_Database \\
--batch-size 300 \\
--reader-threads 60 \\
--intervals GenomicsDBImport.intervals \\
--tmp-dir ./tmp_dir" >> $out_file
ls /gvcf_folder_generated_by_HaplotypeCaller/gvcf/*vcf | sed "s:^:\t-V :" | sed "s:$: \\\:" >> $out_file
sed -i '$s/ \\$//' $out_file
chmod u+x $out_file
3.3 GenotypeGVCFs all site
3.3.1 Reference script:
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
variant_dir=../2_GenomicsDBImport/Genomics_Database
out_dir=./3_1_Chr_Split
intervals=./intervals_without_mtDNA.list
parallel --jobs 7 -a $intervals --header : --colsep ',' \
'
gatk GenotypeGVCFs \
--reference {ref_path} \
--include-non-variant-sites \
--variant gendb://{variant_dir} \
--intervals {intervals} \
--output {out_dir}/3_1_{intervals}.vcf.gz
' \
::: ref_path $ref_path \
::: variant_dir $variant_dir \
::: out_dir $out_dir
3.3.2 Reference script:
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
input=./your_3.3.1_VCF_path.list
output_dir=./3_3_GenotypeGVCFs_all_site
output_file=3_3_GenotypeGVCFs_all_site.vcf.gz
gatk MergeVcfs \
--INPUT $input \
--OUTPUT "$output_dir/$output_file" \
--REFERENCE_SEQUENCE $ref_path
3.4 SelectVariants
Reference script:
input_file=../3_GenotypeGVCFs_all_site/3_3_GenotypeGVCFs_all_site/3_3_GenotypeGVCFs_all_site.vcf.gz
out_dir=./4_SelectVariants
type="SNP INDEL MIXED MNP SYMBOLIC"
parallel --jobs 6 --header : --colsep ',' \
'gatk SelectVariants \
-select-type {type} \
-V {input_file} \
-O {out_dir}/4_{type}.vcf.gz' \
::: input_file $input_file \
::: out_dir $out_dir \
::: type $type
3.5 VariantFiltration Mark
3.5.1 Reference script:
input_file=../4_SelectVariants/4_SNP.vcf.gz
out_dir=./5_1_Split_SNPS_by_Chrom_Original
intervals=./intervals_without_mtDNA.list
parallel --jobs 6 -a $intervals --header : --colsep ',' \
'gatk SelectVariants \
--intervals {intervals} \
-V {input_file} \
-O {out_dir}/{intervals}_SNP.vcf.gz' \
::: input_file $input_file \
::: out_dir $out_dir
3.5.2 Reference script:
input_dir=./5_1_Split_SNPS_by_Chrom_Original
file_path=$(find $input_dir -maxdepth 1 -iname "*.vcf.gz" )
out_dir=./5_2_Split_SNPS_by_Chrom_Mark
parallel --jobs 6 --header : --colsep ' ' \
'
intervals=$(basename -s _SNP.vcf.gz {file_path})
gatk VariantFiltration \
-V {file_path} \
-filter "QD < 2.0" --filter-name "QD2" \
-filter "QUAL < 30.0" --filter-name "QUAL30" \
-filter "SOR > 3.0" --filter-name "SOR3" \
-filter "FS > 60.0" --filter-name "FS60" \
-filter "MQ < 40.0" --filter-name "MQ40" \
-filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \
-filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \
-O {out_dir}/"$intervals"_SNP_Mark.vcf.gz
' \
::: file_path $file_path \
::: out_dir $out_dir
3.5.3 Reference script:
input_dir=../4_SelectVariants/4_SelectVariants
out_dir=./5_3_VariantFiltration_Mark
# Indel hard-filtering
gatk VariantFiltration \
-V $input_dir/4_INDEL.vcf.gz \
-filter "QD < 2.0" --filter-name "QD2" \
-filter "QUAL < 30.0" --filter-name "QUAL30" \
-filter "FS > 200.0" --filter-name "FS200" \
-filter "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20" \
-O $out_dir/6_3_INDEL_Mark_Flag.vcf.gz
# Mixed hard-filtering
gatk VariantFiltration \
-V $input_dir/4_MIXED.vcf.gz \
-filter "QD < 2.0" --filter-name "QD2" \
-filter "QUAL < 30.0" --filter-name "QUAL30" \
-filter "FS > 200.0" --filter-name "FS200" \
-filter "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20" \
-O $out_dir/6_3_MIXED_Mark_Flag.vcf.gz
3.6 Set filtered gt to nocall
3.6.1 Reference script:
input_dir=../5_VariantFiltration_Mark/5_2_Split_SNPS_by_Chrom_Mark
file_path=$(find $input_dir -maxdepth 1 -iname "*.vcf.gz" )
out_dir=./6_1_Split_SNPS_by_Chrom
[ -d $out_dir ] || mkdir -p $out_dir
parallel --jobs 16 --header : --colsep ',' \
'
intervals=$(basename -s _SNP_Mark.vcf.gz {file_path})
gatk SelectVariants \
-V {file_path} \
--set-filtered-gt-to-nocall \
-O {out_dir}/"$intervals"_SNP_Filter_to_Nocall.vcf.gz
' \
::: file_path $file_path \
::: out_dir $out_dir
3.6.2 Reference script:
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
input=./your_3.6.1_VCF_path.list
output_dir=./6_Filter_to_Nocall
output_file=6_SNP_Filter_to_Nocall.vcf.gz
out="$output_dir/$output_file"
[ -d $output_dir ] || mkdir -p $output_dir
gatk MergeVcfs \
--INPUT $input \
--OUTPUT $out \
--REFERENCE_SEQUENCE $ref_path
3.6.3 Reference script:
input_dir=../5_VariantFiltration_Mark/5_3_VariantFiltration_Mark
out_dir=./6_Filter_to_Nocall
[ -d $out_dir ] || mkdir -p $out_dir
type="INDEL MIXED"
parallel --jobs 3 --header : --colsep ',' \
'gatk SelectVariants \
-V {input_dir}/5_3_{type}_Mark_Flag.vcf.gz \
--set-filtered-gt-to-nocall \
-O {out_dir}/6_{type}_Filter_to_Nocall.vcf.gz' \
::: input_dir $input_dir \
::: out_dir $out_dir \
::: type $type
mv $input_dir/5_3_NO_VARIATION_Mark_Flag.vcf.gz $out_dir/6_NO_VARIATION_Filter_to_Nocall.vcf.gz
mv $input_dir/5_3_NO_VARIATION_Mark_Flag.vcf.gz.tbi $out_dir/6_NO_VARIATION_Filter_to_Nocall.vcf.gz.tbi
3.7 MergeVcfs After Filtering
Reference script:
input_dir=../6_Set_filtered_gt_to_nocall/6_Filter_to_Nocall
out_dir=./MergeVcfs_After_Filtering
[ -d $out_dir ] || mkdir -p $out_dir
gatk MergeVcfs \
-I $input_dir/6_SNP_Filter_to_Nocall.vcf.gz \
-I $input_dir/6_INDEL_Filter_to_Nocall.vcf.gz \
-I $input_dir/6_MIXED_Filter_to_Nocall.vcf.gz \
-O $out_dir/7_Merge_Filter_to_Nocall.vcf.gz
3.8 Select only variants
3.8.1 Reference script:
input_file=../7_MergeVcfs_After_Filtering/MergeVcfs_After_Filtering/7_Merge_Filter_to_Nocall.vcf.gz
out_dir=./8_1_Split_by_Chrom
intervals=./intervals_without_mtDNA.list
[ -d $out_dir ] || mkdir -p $out_dir
parallel --jobs 16 -a $intervals --header : --colsep ',' \
'gatk SelectVariants \
-V {input_file} \
--intervals {intervals} \
--exclude-non-variants true \
--exclude-filtered true \
-O {out_dir}/8_1_{intervals}.vcf.gz' \
::: input_file $input_file \
::: out_dir $out_dir
3.8.2 Reference script:
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
input=./your_3.8.1_VCF_path.list
output_dir=./8_Only_variants
output_file=8_Only_variants.vcf.gz
out="$output_dir/$output_file"
[ -d $output_dir ] || mkdir -p $output_dir
gatk MergeVcfs \
--INPUT $input \
--OUTPUT $out \
--REFERENCE_SEQUENCE $ref_path
3.9 Select filtered Variants
Reference script:
input_file=../8_Select_only_variants/8_Only_variants/8_Only_variants.vcf.gz
out_dir=./Select_filtered_Variants
[ -d $out_dir ] || mkdir -p $out_dir
type="SNP INDEL MIXED MNP SYMBOLIC"
parallel --jobs 6 --header : --colsep ',' \
'gatk SelectVariants \
-select-type {type} \
-V {input_file} \
-O {out_dir}/9_{type}.vcf.gz' \
::: input_file $input_file \
::: out_dir $out_dir \
::: type $type