1. Quality Control


1.1 SRA to fastq


Software:

fastq-dump in SRAToolkit

Software download:

https://github.com/ncbi/sra-tools

Reference script:

out_dir=./your_sra_folder
[ -d "$out_dir" ] || mkdir -p "$out_dir"

for i in $(ls ./your_sra_folder/*.sra);
	do
	echo $i
	/absolute_path_of_fastq-dump "$i" \
	--split-3 --gzip -O "$out_dir";
	done   
                

1.2 Fastp


Software:

fastp

Software download:

https://github.com/OpenGene/fastp

Reference script:


fastp=/absolute_path_of_fastp
adapters=/absolute_path_of_S.cerevisiae_adapters_with_BGI.fa

input_dir=./your_fastq_folder
out_dir=./fastq_after_fastp

sample_id=$(find $input_dir -maxdepth 1 -iname "*_1.fastq.gz" -exec basename {} _1.fastq.gz \; | sort | uniq)

parallel --jobs 5 --header : --colsep ' ' \
	'
	out_sub_dir={out_dir}/{sample_id}
	[ -d $out_sub_dir ] || mkdir -p $out_sub_dir
	{fastp} \
      -i {input_dir}/{sample_id}_1.fastq.gz \
      -I {input_dir}/{sample_id}_2.fastq.gz \
      -o $out_sub_dir/{sample_id}.clean.R1.fq.gz \
      -O $out_sub_dir/{sample_id}.clean.R2.fq.gz \
      --adapter_fasta {adapters} \
      --thread 5
	' \
  ::: sample_id $sample_id \
  ::: input_dir $input_dir \
  ::: out_dir $out_dir \
  ::: adapters $adapters \
  ::: fastp $fastp

1.3 Remove adapters


Software:

bbduk in bbmap

Software download:

https://sourceforge.net/projects/bbmap/

Reference script:

outbbduk=/absolute_path_of_bbmap/bbduk.sh
adapters=/absolute_path_of_S.cerevisiae_adapters_with_BGI.fa

input_dir=../your_fastq_after_fastp_folder
out_dir=./fastq_after_rmv_adapters
file_path=$(find $input_dir -maxdepth 2 -iname "*.fastp.html" )

parallel --jobs 8 --header : --colsep ',' \
	'
	sub_dir=$(dirname {file_path}  | awk -F"/" "{print \$NF}")
	sub_input_dir={input_dir}/$sub_dir
	sub_out_dir={out_dir}/$sub_dir
	[ -d $sub_out_dir ] || mkdir -p $sub_out_dir
	file_name=$(basename -s .fastp.html {file_path})
	
	{bbduk} \
	in1=$sub_input_dir/"$file_name".clean.R1.fq.gz \
	in2=$sub_input_dir/"$file_name".clean.R2.fq.gz \
	out1=$sub_out_dir/"$file_name".clean.R1.fq.gz \
	out2=$sub_out_dir/"$file_name".clean.R2.fq.gz \
	ref={adapters} \
	threads=8
	' \
  ::: input_dir $input_dir \
  ::: out_dir $out_dir \
  ::: file_path $file_path \
  ::: adapters $adapters \
  ::: bbduk $bbduk 
                

1.4 Remove phiX


Software:

bbduk in bbmap

Software download:

https://sourceforge.net/projects/bbmap/

Reference script:

bbduk=/absolute_path_of_bbmap/bbduk.sh
phiX=/absolute_path_of_S.cerevisiae_phiX174.fasta

input_dir=../your_fastq_after_rmv_adapters_folder
out_dir=../fastq_after_filter
file_path=$(find $input_dir -maxdepth 2 -iname "*.paired_refstats.txt" )

parallel --jobs 8 --header : --colsep ',' \
	'
	sub_dir=$(dirname {file_path}  | awk -F"/" "{print \$NF}")
	sub_input_dir={input_dir}/$sub_dir
	sub_out_dir={out_dir}/$sub_dir
	[ -d $sub_out_dir ] || mkdir -p $sub_out_dir
	file_name=$(basename -s .paired_refstats.txt {file_path})
	
	{bbduk} \
	in1=$sub_input_dir/"$file_name".clean.R1.fq.gz \
	in2=$sub_input_dir/"$file_name".clean.R2.fq.gz \
	out1=$sub_out_dir/"$file_name".clean.R1.fq.gz \
	out2=$sub_out_dir/"$file_name".clean.R2.fq.gz \
	ref={phiX} \
	threads=8
	' \
  ::: input_dir $input_dir \
  ::: out_dir $out_dir \
  ::: file_path $file_path \
  ::: phiX $phiX \
  ::: bbduk $bbduk 
                

2. Bam


Software:

bwa-mem2、samtools

Software download:

https://github.com/bwa-mem2/bwa-mem2

http://www.htslib.org/download/

Reference script:

bwa_mem2=/absolute_path_of_bwa-mem2
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
input_dir=../your_fastq_after_filter_folder
out_dir=./1_after_bam
select_list=./your_strain_name_list.csv

parallel --jobs 4 -a $select_list --header : --colsep ',' \
	'
	out_dir_sub={out_dir}/{SM}
	[ -d "$out_dir_sub" ] || mkdir -p "$out_dir_sub"
	
	out_Paired_file="$out_dir_sub"/{SM}_Paired.bam
	{bwa_mem2} mem -t 16 -R "@RG\tID:{ID}\tPL:illumina\tSM:{SM}" \
	{ref_path} \
	{input_dir}/{Name}/{Name}.clean.R1.fq.gz   \
	{input_dir}/{Name}/{Name}.clean.R2.fq.gz | \
	samtools view -b - | \
	samtools sort - | \
	samtools markdup -r - "$out_Paired_file"
	samtools index "$out_Paired_file"

	out_Unpaired_file="$out_dir_sub"/{SM}_Unpaired.bam
	{bwa_mem2} mem -t 16 -R "@RG\tID:{ID}\tPL:illumina\tSM:{SM}" \
	{ref_path} \
	{input_dir}/{Name}/{Name}.unpaired.fq.gz | \
	samtools view -b - | \
	samtools sort - | \
	samtools markdup -r - "$out_Unpaired_file"
	samtools index "$out_Unpaired_file"
	' \
  ::: input_dir $input_dir \
  ::: out_dir $out_dir \
  ::: ref_path $ref_path \
  ::: bwa_mem2 $bwa_mem2 
                

3. Variation detection


Software:

samtools、parallel、GATK(Genome Analysis Toolkit)

Software download:

http://www.htslib.org/download/

https://gatk.broadinstitute.org/hc/en-us

Ubuntu install parallel:sudo apt-get install parallel

3.1 HaplotypeCaller


Reference script:

input_dir=../your_1_after_bam_folder
out_dir=./gvcf
filename=$(find $input_dir -maxdepth 2 -iname "*_Paired.bam" | sort )
ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna

parallel --jobs 10 --header : --colsep ',' \
	'
	filename_pre=$(basename -s .mk_dup.bam {filename})
	out_file={out_dir}/"$filename_pre".g.vcf.gz
	
	gatk HaplotypeCaller	\
	-R {ref_path}	\
	-I {filename}	\
	-O $out_file	\
	-ERC GVCF
	' \
	::: out_dir $out_dir \
	::: filename $filename \
	::: ref_path $ref_path   
                

3.2 GenomicsDBImport


Reference script:

out_file=2_2_GenomicsDBImport.sh
if [ -f $out_file ]; then rm $out_file; fi && touch $out_file

echo "#!/bin/bash
gatk GenomicsDBImport \\
	--genomicsdb-workspace-path ./Genomics_Database \\
	--batch-size 300 \\
	--reader-threads 60 \\
	--intervals GenomicsDBImport.intervals \\
	--tmp-dir ./tmp_dir" >> $out_file

ls /gvcf_folder_generated_by_HaplotypeCaller/gvcf/*vcf | sed "s:^:\t-V :" | sed "s:$: \\\:" >> $out_file

sed -i '$s/ \\$//' $out_file
chmod u+x $out_file   
                

3.3 GenotypeGVCFs all site


3.3.1 Reference script:

ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
variant_dir=../2_GenomicsDBImport/Genomics_Database
out_dir=./3_1_Chr_Split
intervals=./intervals_without_mtDNA.list

parallel --jobs 7 -a $intervals --header : --colsep ',' \
	'
	gatk GenotypeGVCFs \
	--reference {ref_path} \
	--include-non-variant-sites \
	--variant gendb://{variant_dir} \
	--intervals {intervals} \
	--output {out_dir}/3_1_{intervals}.vcf.gz
	' \
	::: ref_path $ref_path	\
	::: variant_dir $variant_dir \
	::: out_dir $out_dir  
                

3.3.2 Reference script:

ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
input=./your_3.3.1_VCF_path.list
output_dir=./3_3_GenotypeGVCFs_all_site
output_file=3_3_GenotypeGVCFs_all_site.vcf.gz

gatk MergeVcfs \
	--INPUT $input \
	--OUTPUT "$output_dir/$output_file" \
	--REFERENCE_SEQUENCE $ref_path   
                

3.4 SelectVariants


Reference script:

input_file=../3_GenotypeGVCFs_all_site/3_3_GenotypeGVCFs_all_site/3_3_GenotypeGVCFs_all_site.vcf.gz
out_dir=./4_SelectVariants
type="SNP INDEL MIXED MNP SYMBOLIC"

parallel --jobs 6 --header : --colsep ',' \
	'gatk SelectVariants \
	-select-type {type} \
	-V {input_file} \
	-O {out_dir}/4_{type}.vcf.gz' \
	::: input_file $input_file \
	::: out_dir $out_dir \
	::: type $type   
                

3.5 VariantFiltration Mark


3.5.1 Reference script:

input_file=../4_SelectVariants/4_SNP.vcf.gz
out_dir=./5_1_Split_SNPS_by_Chrom_Original
intervals=./intervals_without_mtDNA.list

parallel --jobs 6 -a $intervals --header : --colsep ',' \
	'gatk SelectVariants \
	--intervals {intervals} \
	-V {input_file} \
	-O {out_dir}/{intervals}_SNP.vcf.gz' \
	::: input_file $input_file \
	::: out_dir $out_dir 
                

3.5.2 Reference script:

input_dir=./5_1_Split_SNPS_by_Chrom_Original
file_path=$(find $input_dir -maxdepth 1 -iname "*.vcf.gz" )
out_dir=./5_2_Split_SNPS_by_Chrom_Mark

parallel --jobs 6 --header : --colsep ' ' \
	'
	intervals=$(basename -s _SNP.vcf.gz {file_path})
	gatk VariantFiltration \
	-V {file_path} \
    -filter "QD < 2.0" --filter-name "QD2" \
    -filter "QUAL < 30.0" --filter-name "QUAL30" \
    -filter "SOR > 3.0" --filter-name "SOR3" \
    -filter "FS > 60.0" --filter-name "FS60" \
    -filter "MQ < 40.0" --filter-name "MQ40" \
    -filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \
    -filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \
	-O {out_dir}/"$intervals"_SNP_Mark.vcf.gz
	' \
	::: file_path $file_path \
	::: out_dir $out_dir   
                

3.5.3 Reference script:

input_dir=../4_SelectVariants/4_SelectVariants
out_dir=./5_3_VariantFiltration_Mark

# Indel hard-filtering
gatk VariantFiltration \
	-V $input_dir/4_INDEL.vcf.gz \
    -filter "QD < 2.0" --filter-name "QD2" \
    -filter "QUAL < 30.0" --filter-name "QUAL30" \
    -filter "FS > 200.0" --filter-name "FS200" \
    -filter "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20" \
	-O $out_dir/6_3_INDEL_Mark_Flag.vcf.gz

# Mixed hard-filtering
gatk VariantFiltration \
	-V $input_dir/4_MIXED.vcf.gz \
    -filter "QD < 2.0" --filter-name "QD2" \
    -filter "QUAL < 30.0" --filter-name "QUAL30" \
    -filter "FS > 200.0" --filter-name "FS200" \
    -filter "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20" \
	-O $out_dir/6_3_MIXED_Mark_Flag.vcf.gz  
                

3.6 Set filtered gt to nocall


3.6.1 Reference script:

input_dir=../5_VariantFiltration_Mark/5_2_Split_SNPS_by_Chrom_Mark
file_path=$(find $input_dir -maxdepth 1 -iname "*.vcf.gz" )
out_dir=./6_1_Split_SNPS_by_Chrom
[ -d $out_dir ] || mkdir -p $out_dir

parallel --jobs 16 --header : --colsep ',' \
	'
	intervals=$(basename -s _SNP_Mark.vcf.gz {file_path})
	gatk SelectVariants \
	-V {file_path} \
	--set-filtered-gt-to-nocall \
	-O {out_dir}/"$intervals"_SNP_Filter_to_Nocall.vcf.gz
	' \
	::: file_path $file_path \
	::: out_dir $out_dir 
                

3.6.2 Reference script:

ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
input=./your_3.6.1_VCF_path.list
output_dir=./6_Filter_to_Nocall
output_file=6_SNP_Filter_to_Nocall.vcf.gz
out="$output_dir/$output_file"
[ -d $output_dir ] || mkdir -p $output_dir

gatk MergeVcfs \
	--INPUT $input \
	--OUTPUT $out \
	--REFERENCE_SEQUENCE $ref_path   
                

3.6.3 Reference script:

input_dir=../5_VariantFiltration_Mark/5_3_VariantFiltration_Mark
out_dir=./6_Filter_to_Nocall
[ -d $out_dir ] || mkdir -p $out_dir

type="INDEL MIXED"

parallel --jobs 3 --header : --colsep ',' \
	'gatk SelectVariants \
	-V {input_dir}/5_3_{type}_Mark_Flag.vcf.gz \
	--set-filtered-gt-to-nocall \
	-O {out_dir}/6_{type}_Filter_to_Nocall.vcf.gz' \
	::: input_dir $input_dir \
	::: out_dir $out_dir \
	::: type $type

mv $input_dir/5_3_NO_VARIATION_Mark_Flag.vcf.gz $out_dir/6_NO_VARIATION_Filter_to_Nocall.vcf.gz
mv $input_dir/5_3_NO_VARIATION_Mark_Flag.vcf.gz.tbi $out_dir/6_NO_VARIATION_Filter_to_Nocall.vcf.gz.tbi 
                

3.7 MergeVcfs After Filtering


Reference script:

input_dir=../6_Set_filtered_gt_to_nocall/6_Filter_to_Nocall
out_dir=./MergeVcfs_After_Filtering
[ -d $out_dir ] || mkdir -p $out_dir

gatk MergeVcfs \
	-I $input_dir/6_SNP_Filter_to_Nocall.vcf.gz \
	-I $input_dir/6_INDEL_Filter_to_Nocall.vcf.gz \
	-I $input_dir/6_MIXED_Filter_to_Nocall.vcf.gz \
	-O $out_dir/7_Merge_Filter_to_Nocall.vcf.gz   
                

3.8 Select only variants


3.8.1 Reference script:

input_file=../7_MergeVcfs_After_Filtering/MergeVcfs_After_Filtering/7_Merge_Filter_to_Nocall.vcf.gz
out_dir=./8_1_Split_by_Chrom
intervals=./intervals_without_mtDNA.list
[ -d $out_dir ] || mkdir -p $out_dir

parallel --jobs 16 -a $intervals --header : --colsep ',' \
	'gatk SelectVariants \
	-V {input_file} \
	--intervals {intervals} \
	--exclude-non-variants true \
	--exclude-filtered true \
	-O {out_dir}/8_1_{intervals}.vcf.gz' \
	::: input_file $input_file \
	::: out_dir $out_dir 
                

3.8.2 Reference script:

ref_path=/absolute_path_of_S_cerevisiae_S288C_R64.fna
input=./your_3.8.1_VCF_path.list
output_dir=./8_Only_variants
output_file=8_Only_variants.vcf.gz
out="$output_dir/$output_file"
[ -d $output_dir ] || mkdir -p $output_dir

gatk MergeVcfs \
	--INPUT $input \
	--OUTPUT $out \
	--REFERENCE_SEQUENCE $ref_path   
                

3.9 Select filtered Variants


Reference script:

input_file=../8_Select_only_variants/8_Only_variants/8_Only_variants.vcf.gz
out_dir=./Select_filtered_Variants
[ -d $out_dir ] || mkdir -p $out_dir

type="SNP INDEL MIXED MNP SYMBOLIC"

parallel --jobs 6 --header : --colsep ',' \
	'gatk SelectVariants \
	-select-type {type} \
	-V {input_file} \
	-O {out_dir}/9_{type}.vcf.gz' \
	::: input_file $input_file \
	::: out_dir $out_dir \
	::: type $type