Difference between revisions of "UGP Variant Pipeline 1.3.0"
(One intermediate revision by the same user not shown) | |||
Line 1: | Line 1: | ||
= Utah Genome Project = | = Utah Genome Project = | ||
− | + | May. 2015 | |
Variant Calling Pipeline Version 1.3.0 | Variant Calling Pipeline Version 1.3.0 | ||
Line 8: | Line 8: | ||
*BWA: 0.7.10 | *BWA: 0.7.10 | ||
*GATK: 3.3-0 | *GATK: 3.3-0 | ||
− | *SamTools: 0. | + | *SamTools: 0.1.19 |
*Samblaster | *Samblaster | ||
*Sambamba | *Sambamba | ||
Line 79: | Line 79: | ||
* Try the download again. | * Try the download again. | ||
* Contact the sequence provider. | * Contact the sequence provider. | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
== Indexing == | == Indexing == | ||
Line 109: | Line 92: | ||
*SamTools | *SamTools | ||
samtools faidx human_g1k_v37_decoy.fasta | samtools faidx human_g1k_v37_decoy.fasta | ||
+ | |||
+ | == FastQ File Analyses == | ||
+ | |||
+ | fastqc --threads # -o <output dir> -f fastq fastq-file.fq.gz 2> fastqc_run.log-# | ||
== Alignment == | == Alignment == | ||
Line 122: | Line 109: | ||
*idxstats | *idxstats | ||
− | samtools idxstats [dedup bam files] > dedup-bamfile.stats | + | samtools idxstats [sorted dedup bam files] > dedup-bamfile.stats |
*flagstat | *flagstat | ||
Line 215: | Line 202: | ||
''Collect all individual gvcf files.'' | ''Collect all individual gvcf files.'' | ||
− | ''These will be the files kept for future GenotypeGVCF re-runs.'' | + | ''These will be the files kept for GenotypeGVCF and future GenotypeGVCF re-runs.'' |
java -cp GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants | java -cp GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants | ||
Line 233: | Line 220: | ||
--variant *.raw.snps.indels.gCat.vcf | --variant *.raw.snps.indels.gCat.vcf | ||
--variant [ ... ] | --variant [ ... ] | ||
− | -o cApTUrE_* | + | -o cApTUrE_*_mergeGvcf.vcf |
&> CombineGVCF.log-* | &> CombineGVCF.log-* | ||
− | + | *GenotypeGVCFs | |
java -jar -Xmx#g -XX:ParallelGCThreads=# GenomeAnalysisTK.jar | java -jar -Xmx#g -XX:ParallelGCThreads=# GenomeAnalysisTK.jar | ||
Line 242: | Line 229: | ||
-R human_g1k_v37_decoy.fasta | -R human_g1k_v37_decoy.fasta | ||
--num_threads # | --num_threads # | ||
− | --variant cApTUrE_* | + | --variant [cApTUrE_*_mergeGvcf.vcf or <individual>.raw.snps.indels.gCat.vcf] |
--variant CEU_mergeGvcf.vcf | --variant CEU_mergeGvcf.vcf | ||
--variant GBR_mergeGvcf.vcf | --variant GBR_mergeGvcf.vcf | ||
--variant FIN_mergeGvcf.vcf | --variant FIN_mergeGvcf.vcf | ||
− | -L chr#_region_file.list | + | -L chr#_region_file.list |
− | -o | + | -o chr#_region_genotyped.vcf |
&> GenotypeGVCF.log-# | &> GenotypeGVCF.log-# | ||
− | + | *Combine_Genotyped | |
− | |||
''Collect all the individual genotype steps.'' | ''Collect all the individual genotype steps.'' | ||
Line 257: | Line 243: | ||
-R human_g1k_v37_decoy.fasta | -R human_g1k_v37_decoy.fasta | ||
--assumeSorted | --assumeSorted | ||
− | -V | + | -V chr#_region_genotyped.vcf |
-V [ ... ] | -V [ ... ] | ||
− | -out cApTUrE_* | + | -out cApTUrE_*_cat_genotyped.vcf |
&> Combine_Genotyped.log-# | &> Combine_Genotyped.log-# | ||
− | + | == VariantRecalibrator == | |
*SNP Recalibration | *SNP Recalibration | ||
Line 279: | Line 265: | ||
-an FS | -an FS | ||
-an InbreedingCoeff | -an InbreedingCoeff | ||
− | -input cApTUrE_* | + | -input cApTUrE_*_cat_genotyped.vcf |
-recalFile cApTUrE_*_snp_recal | -recalFile cApTUrE_*_snp_recal | ||
-tranchesFile cApTUrE_*_snp_tranches | -tranchesFile cApTUrE_*_snp_tranches | ||
Line 298: | Line 284: | ||
-an ReadPosRankSum | -an ReadPosRankSum | ||
-an FS | -an FS | ||
− | -input cApTUrE_* | + | -input cApTUrE_*_cat_genotyped.vcf |
-recalFile cApTUrE_*_indel_recal | -recalFile cApTUrE_*_indel_recal | ||
-tranchesFile cApTUrE_*_indel_tranches | -tranchesFile cApTUrE_*_indel_tranches | ||
Line 305: | Line 291: | ||
&> VariantRecalibrator_INDEL.log-# | &> VariantRecalibrator_INDEL.log-# | ||
− | + | == ApplyRecalibration == | |
*SNP Apply | *SNP Apply | ||
Line 315: | Line 301: | ||
--excludeFiltered | --excludeFiltered | ||
--num_threads # | --num_threads # | ||
− | -input cApTUrE_* | + | -input cApTUrE_*_cat_genotyped.vcf |
-recalFile cApTUrE_*_snp_recal | -recalFile cApTUrE_*_snp_recal | ||
-tranchesFile cApTUrE_*_snp_tranches | -tranchesFile cApTUrE_*_snp_tranches | ||
Line 330: | Line 316: | ||
--excludeFiltered | --excludeFiltered | ||
--num_threads # | --num_threads # | ||
− | -input cApTUrE_* | + | -input cApTUrE_*_cat_genotyped.vcf |
-recalFile cApTUrE_*_indel_recal | -recalFile cApTUrE_*_indel_recal | ||
-tranchesFile cApTUrE_*_indel_tranches | -tranchesFile cApTUrE_*_indel_tranches | ||
Line 351: | Line 337: | ||
-o cApTUrE_*_Combined.vcf | -o cApTUrE_*_Combined.vcf | ||
&> CombineVariants.log-* | &> CombineVariants.log-* | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− |
Latest revision as of 18:22, 28 May 2015
Contents
Utah Genome Project
May. 2015 Variant Calling Pipeline Version 1.3.0
Software Versions
- cApTUrE is a lightweight NGS pipeline, created for the Utah Genome Project (UGP)
- BWA: 0.7.10
- GATK: 3.3-0
- SamTools: 0.1.19
- Samblaster
- Sambamba
- FastQC v0.10.1
Data Source
Data sets used for the variant calling pipeline come from the Broad GSA (GATK) group as the 'GATK resource bundle version 2.8
wget -r ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.5/b37
Reference Genome (GRCh37) Now includes phix sequence.
- human_g1k_v37_decoy.fasta
Call region file generated from NCBI
- GRCh37 GFF3
VCF files for RealignerTargetCreator knowns and dbsnp for BaseRecalibrator.
- known_indel: Mills_and_1000G_gold_standard.indels.b37.vcf
- known_indel: 1000G_phase1.indels.b37.vcf
- known_dbsnp: dbsnp_137.b37.vcf
Background Files
- We have created 1000Genomes (BWA mem/GATK 3.0+) background files to be ran concurrently with the GenotypeGVCFs step.
Groups Currently completed:
- CEU
- GBR
- FIN
Version 1.0.5 background files have been updated to show only the indviduals of each group, not the file names.
This is a complete list of the background individuals for run completed > 1.0.5 [1]
If you would like a copy of the current files, we have made a public AWS s3 bucket Using s3cmd execute the following command: s3cmd get s3://ugp-1k-backgrounds --recursive
Alternatively to access the files without s3cmd the following use the following URLs: http://s3-us-west-2.amazonaws.com/ugp-1k-backgrounds/CEU_mergeGvcf.vcf http://s3-us-west-2.amazonaws.com/ugp-1k-backgrounds/CEU_mergeGvcf.vcf.idx http://s3-us-west-2.amazonaws.com/ugp-1k-backgrounds/FIN_mergeGvcf.vcf http://s3-us-west-2.amazonaws.com/ugp-1k-backgrounds/FIN_mergeGvcf.vcf.idx http://s3-us-west-2.amazonaws.com/ugp-1k-backgrounds/GBR_mergeGvcf.vcf http://s3-us-west-2.amazonaws.com/ugp-1k-backgrounds/GBR_mergeGvcf.vcf.idx
Resource files for VariantRecalibrator_SNP
- hapmap_3.3.b37.vcf
- 1000G_omni2.5.b37.vcf
- 1000G_phase1.snps.high_confidence.b37.vcf
Resource files for VariantRecalibrator_INDEL
- Mills_and_1000G_gold_standard.indels.b37.vcf
- 1000G_phase1.indels.b37.vcf
Sequencing
This pipeline is designed for 100 bp (or greater) Illumina HiSeq PE exome or WGS sequence data with Sanger/Illumina 1.9 quality encoding, and uses Illumina naming convention found here [2]
Validate File Integrity with md5sum
An md5sum signature should be provided for each FastQ file by the sequencing center. After the file has been downloaded, locally check the md5sum to be sure that no data corruption occurred during the file transfer.
md5sum file.fastq > file_local.md5 diff file_local.md5 file_provided.md5 Now the pipeline runs md5_check to validation the results and will quit if errors are found.
If the md5sum signature differs from that provided for the file:
- Check to be sure you have the correct file.
- Check if the md5sum was calculated on that compressed or uncompressed file by the provider and be sure to do the same with the local copy.
- Try the download again.
- Contact the sequence provider.
Indexing
The following indexing is required using BWA, Picard and SamTools. GATK requires all three. However this step only needs to be done once "per-machine".
- BWA
bwa index -a bwtsw human_g1k_v37_decoy.fasta
- Picard
java -jar CreateSequenceDictionary.jar R=human_g1k_v37_decoy.fasta O=human_g1k_v37_decoy.dic
- SamTools
samtools faidx human_g1k_v37_decoy.fasta
FastQ File Analyses
fastqc --threads # -o <output dir> -f fastq fastq-file.fq.gz 2> fastqc_run.log-#
Alignment
Align reads to the genome with bwa.
The 'BWA-mem' program will find the reference coordinates of the input reads (independent of their mate-pair). The following parameters are those used by the 1KG project and GATK for aligning Illumina data.
bwa mem -R "read group" human_g1k_v37_decoy.fasta Sample1_L1_R1.fq Sample1_L1_R2.fq | samblaster |sambamba view -f bam -l 0 -S /dev/stdin | sambamba sort -m 50G -o *_#_sorted_Dedup.bam
BAM File Analyses
- idxstats
samtools idxstats [sorted dedup bam files] > dedup-bamfile.stats
- flagstat
/scratch/ucgd/lustre/u0413537/software/samtools-0.1.19/
samtools flagstat _#_sorted_Dedup.bam > _#_sorted_Dedup.flagstat 2> flagstat.log-#
BAM Merging
- sambamba
Merging lane BAM files into single individual BAM file.
/scratch/ucgd/lustre/u0413537/software/sambamba/ sambamba merge --nthreads # individual_#_sorted_Dedup_merged.bam [individual_#_sorted_Dedup.bam files]
GATK Polishing Steps
- RealignerTargetCreator
This is where the tasks are broken into chromosomal regions.
java -jar -Xmx#g -XX:ParallelGCThreads=# -Djava.io.tmpdir=/tmp GenomeAnalysisTK.jar -T RealignerTargetCreator -R human_g1k_v37_decoy.fasta -I *sorted_Dedup_merged.bam --num_threads # --known Mills_and_1000G_gold_standard.indels.b37.vcf --known 1000G_phase1.indels.b37.vcf -L chr#_region_file.list -o *_chr#_realign.intervals 2> RealignerTargetCreator.log-#
- IndelRealigner
java -jar -Xmx#g -XX:ParallelGCThreads=# -Djava.io.tmpdir=/tmp GenomeAnalysisTK.jar -T IndelRealigner -R human_g1k_v37_decoy.fasta -I *_sorted_Dedup_merged.bam -L chr#_region_file.list -targetIntervals *_chr#_realign.intervals -known Mills_and_1000G_gold_standard.indels.b37.vcf -known 1000G_phase1.indels.b37.vcf -o *_sorted_Dedup_merged_realign_chr#.bam 2> IndelRealigner.log-1
- BaseRecalibrator
java -jar -Xmx#g -XX:ParallelGCThreads=# -Djava.io.tmpdir=/tmp GenomeAnalysisTK.jar -T BaseRecalibrator -R /human_g1k_v37_decoy.fasta -I *_sorted_Dedup_merged_realign_chr#.bam --num_cpu_threads_per_data_thread # --knownSites dbsnp_137.b37.vcf --knownSites Mills_and_1000G_gold_standard.indels.b37.vcf --knownSites 1000G_phase1.indels.b37.vcf -o *_sorted_Dedup_merged_realign_chr#_recal_data.table &> BaseRecalibrator.log-#
- PrintReads
java -jar -Xmx#g -XX:ParallelGCThreads=# -Djava.io.tmpdir=/tmp GenomeAnalysisTK.jar -T PrintReads -R human_g1k_v37_decoy.fasta -I *_sorted_Dedup_merged_realign_chr#.bam --num_cpu_threads_per_data_thread # -BQSR *_sorted_Dedup_merged_realign_chr#_recal_data.table -o *_sorted_Dedup_merged_realign_chr#_recal.bam &> PrintReads.log-#
GATK Variant Calling
- HaplotypeCaller
Now HaplotypeCaller handels SNP and INDEL calls.
java -jar -Xmx#g -XX:ParallelGCThreads=# -Djava.io.tmpdir=/tmp GenomeAnalysisTK.jar -T HaplotypeCaller -R human_g1k_v37_decoy.fasta --min_base_quality_score 20 --variant_index_parameter 128000 --emitRefConfidence GVCF --standard_min_confidence_threshold_for_calling 30.0 --num_cpu_threads_per_data_thread # --variant_index_type LINEAR --standard_min_confidence_threshold_for_emitting 30.0 -I *_sorted_Dedup_merged_realign_chr*_recal.bam -L chr#_region_file.list -o chr#_region_*.raw.snps.indels.gvcf &> HaplotypeCaller.log-#
- CatVariants
Collect all individual gvcf files. These will be the files kept for GenotypeGVCF and future GenotypeGVCF re-runs.
java -cp GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants -R human_g1k_v37_decoy.fasta --assumeSorted -V chr#_region_*.raw.snps.indels.vcf -V [ ... ] -out <individual>.raw.snps.indels.gCat.vcf &> CatVariants.log-*
- CombineGVCFs
This step will only be ran on > 200 individual gCat call sets.
java -jar -Xmx#g -XX:ParallelGCThreads=# GenomeAnalysisTK.jar -T CombineGVCFs -R human_g1k_v37_decoy.fasta --variant *.raw.snps.indels.gCat.vcf --variant [ ... ] -o cApTUrE_*_mergeGvcf.vcf &> CombineGVCF.log-*
- GenotypeGVCFs
java -jar -Xmx#g -XX:ParallelGCThreads=# GenomeAnalysisTK.jar -T GenotypeGVCFs -R human_g1k_v37_decoy.fasta --num_threads # --variant [cApTUrE_*_mergeGvcf.vcf or <individual>.raw.snps.indels.gCat.vcf] --variant CEU_mergeGvcf.vcf --variant GBR_mergeGvcf.vcf --variant FIN_mergeGvcf.vcf -L chr#_region_file.list -o chr#_region_genotyped.vcf &> GenotypeGVCF.log-#
- Combine_Genotyped
Collect all the individual genotype steps.
java -cp GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants -R human_g1k_v37_decoy.fasta --assumeSorted -V chr#_region_genotyped.vcf -V [ ... ] -out cApTUrE_*_cat_genotyped.vcf &> Combine_Genotyped.log-#
VariantRecalibrator
- SNP Recalibration
java -jar -Xmx#g -XX:ParallelGCThreads=# -Djava.io.tmpdir=tmp GenomeAnalysisTK.jar -T VariantRecalibrator -R human_g1k_v37_decoy.fasta --minNumBadVariants 5000 --num_threads # -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.vcf -resource:omni,known=false,training=true,truth=true,prior=12.0 1000G_omni2.5.b37.vcf -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.b37.vcf -an QD -an MQRankSum -an ReadPosRankSum -an FS -an InbreedingCoeff -input cApTUrE_*_cat_genotyped.vcf -recalFile cApTUrE_*_snp_recal -tranchesFile cApTUrE_*_snp_tranches -rscriptFile cApTUrE_*_snp_plots.R -mode SNP &> VariantRecalibrator_SNP.log-#
- INDEL Recalibration
java -jar -Xmx#g -XX:ParallelGCThreads=# -Djava.io.tmpdir=/tmp GenomeAnalysisTK.jar -T VariantRecalibrator -R human_g1k_v37_decoy.fasta --minNumBadVariants 5000 --num_threads # -resource:mills,known=false,training=true,truth=true,prior=12.0 Mills_and_1000G_gold_standard.indels.b37.vcf -resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_phase1.indels.b37.vcf -an MQRankSum -an ReadPosRankSum -an FS -input cApTUrE_*_cat_genotyped.vcf -recalFile cApTUrE_*_indel_recal -tranchesFile cApTUrE_*_indel_tranches -rscriptFile cApTUrE_*_indel_plots.R -mode INDEL &> VariantRecalibrator_INDEL.log-#
ApplyRecalibration
- SNP Apply
java -jar -Xmx#g -Djava.io.tmpdir=/tmp GenomeAnalysisTK.jar -T ApplyRecalibration -R human_g1k_v37_decoy.fasta --ts_filter_level 99.5 --excludeFiltered --num_threads # -input cApTUrE_*_cat_genotyped.vcf -recalFile cApTUrE_*_snp_recal -tranchesFile cApTUrE_*_snp_tranches -mode SNP -o cApTUrE_*_recal_SNP.vcf &> ApplyRecalibration_SNP.log-*
- INDEL Apply
java -jar -Xmx#g -Djava.io.tmpdir=/tmp GenomeAnalysisTK.jar -T ApplyRecalibration -R human_g1k_v37_decoy.fasta --ts_filter_level 99.0 --excludeFiltered --num_threads # -input cApTUrE_*_cat_genotyped.vcf -recalFile cApTUrE_*_indel_recal -tranchesFile cApTUrE_*_indel_tranches -mode INDEL -o cApTUrE_*_recal_INDEL.vcf &> ApplyRecalibration_INDEL.log-*
CombineVarients
These command will combine INDEL and SNP files into a single VCF file.
- CombineVarients
java -jar -Xmx#g -Djava.io.tmpdir=/tmp GenomeAnalysisTK.jar -T CombineVariants -R human_g1k_v37_decoy.fasta --num_threads # --genotypemergeoption UNSORTED --variant cApTUrE_*_recal_SNP.vcf --variant cApTUrE_*_recal_INDEL.vcf -o cApTUrE_*_Combined.vcf &> CombineVariants.log-*