Difference between revisions of "UGP Variant Pipeline 0.0.2"

From Utah Genome Project Wiki
Jump to navigation Jump to search
Line 95: Line 95:
 
The AddOrReplaceReadGroups tool will add the correct readgroup ID to the BAM files
 
The AddOrReplaceReadGroups tool will add the correct readgroup ID to the BAM files
  
  nohup java -Xmx4g -jar picard-tools/AddOrReplaceReadGroups.jar \
+
  java -Xmx4g -jar picard-tools/AddOrReplaceReadGroups.jar \
 
     INPUT=Sample1_L1.bam                                            \
 
     INPUT=Sample1_L1.bam                                            \
 
     OUTPUT=Sample1_L1.rg.bam                                        \
 
     OUTPUT=Sample1_L1.rg.bam                                        \
Line 129: Line 129:
 
Remove PCR/Optical duplicate reads
 
Remove PCR/Optical duplicate reads
  
  java -Xmx4g -jar MarkDuplicates.jar \
+
  java -Xmx4g -jar MarkDuplicates.jar   \
     INPUT=lane1.markdup.bam             \
+
     INPUT=Sample1.bam                   \
     OUTPUT=lane1_markdup                \
+
     OUTPUT=Sample1.dedup.bam            \
 
     METRICS_FILE=lane1_dup_metrics.txt  \
 
     METRICS_FILE=lane1_dup_metrics.txt  \
 
     ASSUME_SORTED=true                  \
 
     ASSUME_SORTED=true                  \
Line 141: Line 141:
 
=== Local Realignment of Indels ===
 
=== Local Realignment of Indels ===
  
  java -Xmx4g -jar /usr/local/GenomeAnalysisTK-2.4-7-g5e89f01//GenomeAnalysisTK.jar \
+
  java -Xmx4g -jar GenomeAnalysisTK.jar                 \
     -I all_bams.list                                                 \
+
     -I All_BAMs.list                                   \
     -T RealignerTargetCreator                                       \
+
    -o Realign_Intervals.txt                            \
     -R human_g1K_v37.fasta                                           \
+
     -T RealignerTargetCreator                           \
    -o lane_1.realign.intervals                        \
+
     -R human_g1K_v37.fasta                             \
     -known Mills_and_1000G_gold_standard.indels.b37.vcf                             \
+
     -known Mills_and_1000G_gold_standard.indels.b37.vcf \
     -known 1000G_phase1.indels.b37.vcf.gz                                          \
+
     -known 1000G_phase1.indels.b37.vcf                 \
     -nt 24                                                                         \
+
     -nt 24                                             \
    >  RealignerTargetCreator                                                      \
 
 
     2> RealignerTargetCreator.error
 
     2> RealignerTargetCreator.error
  
  java -Xmx4g -jar /usr/local/GenomeAnalysisTK-2.4-7-g5e89f01/GenomeAnalysisTK.jar \
+
  java -Xmx4g -jar GenomeAnalysisTK.jar                 \
     -T IndelRealigner                                                             \
+
     -T IndelRealigner                                  \
    -R fasta/hs_ref_GRCh37.p10_ALL_chr_hs37d5.fa                                   \
+
     -I Sample1.dedup.bam                               \
     -I lane1_markdup.bam                                                           \
+
     -o Sample1.realign.bam                              \
     -targetIntervals realign.intervals                                            \
+
     -R human_g1K_v37.fasta                              \
     -o 9958X_Realigned.bam                                                        \
+
     -targetIntervals Realign_Intervals.txt              \
     -known Mills_and_1000G_gold_standard.indels.b37.vcf                            \
+
     -known Mills_and_1000G_gold_standard.indels.b37.vcf \
     -known 1000G_phase1.indels.b37.vcf.gz                                          \
 
    >  IndelRealigner.out                                                          \
 
 
     2> IndelRealigner.error
 
     2> IndelRealigner.error
  
 
=== Base Quality Score Recalibration ===
 
=== Base Quality Score Recalibration ===
  
java -Xmx4g -jar GenomeAnalysisTK.jar \ -T PrintReads \ -R reference.fasta \ -I input.bam \ -BQSR recalibration_report.grp \ -o output.bam - See more at: http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibration-bqsr#sthash.Gy8Grf3R.dpuf
+
java -Xmx4g -jar GenomeAnalysisTK.jar \
 
+
  -T PrintReads                       \
Sort and index recalibrated alignment (~5h)
+
  -I Sample1.realign.bam             \
 
+
   -o Sample1.recal.bam               \
java -Xmx4g -jar /tools/picard-tools-1.32/SortSam.jar \
+
  -R human_g1K_v37.fasta              \
    INPUT=/output/filename.b37_1kg.recal.bam           \
+
  -BQSR recalibration_report.grp      \
    OUTPUT=/output/filename.b37_1kg.recal.sorted.bam   \
 
    SORT_ORDER=coordinate                              \
 
    VALIDATION_STRINGENCY=LENIENT                      \
 
    TMP_DIR=/big/drive
 
 
 
java -Xmx4g -jar /tools/picard-tools-1.32/BuildBamIndex.jar \
 
    INPUT=/output/filename.b37_1kg.recal.sorted.bam           \
 
    OUTPUT=/output/filename.b37_1kg.recal.sorted.bam.bai      \
 
    VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/big/drive
 
 
 
Calculate covariates after realignment and recalibration
 
 
 
java -Xmx4g -jar GenomeAnalysisTK.jar                      \
 
    -l INFO                                                \
 
    -T CountCovariates                                      \
 
    -U ALLOW_UNINDEXED_BAM                                  \
 
    -R /resources//hg19/indices/b37_1kg.fa                  \
 
    --DBSNP /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \
 
    -I /output/filename.b37_1kg.recal.sorted.bam            \
 
    -cov ReadGroupcovariate                                \
 
    -cov QualityScoreCovariate                              \
 
    -cov CycleCovariate                                    \
 
    -cov DinucCovariate                                    \
 
    -recalFile /output/filename.b37_1kg.recal.covariate_table.csv
 
 
 
Analyze covariates before and after
 
 
 
java -Xmx4g -jar AnalyzeCovariates.jar                            \
 
    -l INFO                                                        \
 
    -resources /resources//hg19/indices/b37_1kg.fa                  \
 
    --recal_file /output/filename.b37_1kg.matefixed.covariate_table.csv \
 
    -outputDir /output/filename.b37_1kg.recal.stats_before/          \
 
    -Rscript ${rscript}
 
    -ignoreQ 5
 
 
 
java -Xmx4g -jar AnalyzeCovariates.jar                          \
 
    -l INFO                                                      \
 
    -resources /resources//hg19/indices/b37_1kg.fa              \
 
    --recal_file /output/filename.b37_1kg.recal.covariate_table.csv \
 
    -outputDir /output/filename.b37_1kg.recal.stats_after/        \
 
    -Rscript ${rscript}                                          \
 
    -ignoreQ 5
 
 
 
=== Generate the SAM MD Tag ===
 
 
 
The MD tag in SAM files provides a string representation of mismatching positions.  The CIGAR string used in SAM files does not provide full detail of mismatch positions.  The samtools calmd command is used to generate MD tags as well as which fix the NM tags (Edit distance to the reference) and introduce the BQ tags (Base Alignment Quality) which can be used during variant calling.
 
 
 
samtools calmd -Erb recalibrated_file.bam $reference.fasta > $bq_file.bam
 
 
 
=== Strip Extraneous BAM Tags ===
 
 
 
Run-level BAMs have extraneous tags (OQ, XM, XG, XO) stripped from them, to reduce total file size by around 30%.
 
 
 
GATK ReduceReads ???
 
 
 
=== Merge BAMS ===
 
 
 
Merge all BAMS for a given individual
 
 
 
java $jvm_args -jar MergeSamFiles.jar INPUT=$tag_stripped_bam_file(s) OUTPUT=$library_level_bam VALIDATION_STRINGENCY=LENIENT
 
 
 
=== Merge All BAMS for the Project ===
 
 
 
This happens above... Picard MergeSamFiles is used to Merge all the SAM files for a given project.
 
 
 
java $jvm_args -jar MergeSamFiles.jar INPUT=$markdup_bam_file(s) OUTPUT=$platform_level_bam VALIDATION_STRINGENCY=LENIENT
 
  
 
=== BAM Quality Control ===
 
=== BAM Quality Control ===
Line 260: Line 191:
  
 
CollectAlignmentSummaryMetrics
 
CollectAlignmentSummaryMetrics
  java -jar -Xmx4g /tools/picard-tools-1.32/CollectAlignmentSummaryMetrics.jar \
+
  java -Xmx4g -jar /tools/picard-tools-1.32/CollectAlignmentSummaryMetrics.jar \
 
     I=/output/filename.b37_1kg.sorted.bam                                      \
 
     I=/output/filename.b37_1kg.sorted.bam                                      \
 
     O=/output/filename.b37_1kg.AlignmentSummaryMetrics                        \
 
     O=/output/filename.b37_1kg.AlignmentSummaryMetrics                        \
     R=/resources//hg19/indices/b37_1kg.fa                                     \
+
     R=human_g1K_v37.fasta                                     \
 
     VALIDATION_STRINGENCY=LENIENT                                            \
 
     VALIDATION_STRINGENCY=LENIENT                                            \
 
     TMP_DIR=/big/drive
 
     TMP_DIR=/big/drive
  
 
CollectGcBiasMetrics
 
CollectGcBiasMetrics
  java -jar /tools/picard-tools-1.32/CollectGcBiasMetrics.jar \
+
  java -Xmx4g -jar /tools/picard-tools-1.32/CollectGcBiasMetrics.jar \
     R=/resources//hg19/indices/b37_1kg.fa                   \
+
     R=human_g1K_v37.fasta                   \
 
     I=/output/filename.b37_1kg.sorted.bam                    \
 
     I=/output/filename.b37_1kg.sorted.bam                    \
 
     O=/output/filename.b37_1kg.GcBiasMetrics                  \
 
     O=/output/filename.b37_1kg.GcBiasMetrics                  \
Line 277: Line 208:
  
 
CollectInsertSizeMetrics
 
CollectInsertSizeMetrics
  java -jar /tools/picard-tools-1.32/CollectInsertSizeMetrics.jar \
+
  java -Xmx4g -jar /tools/picard-tools-1.32/CollectInsertSizeMetrics.jar \
 
     I=/output/filename.b37_1kg.sorted.bam                        \
 
     I=/output/filename.b37_1kg.sorted.bam                        \
 
     O=/output/filename.b37_1kg.CollectInsertSizeMetrics          \
 
     O=/output/filename.b37_1kg.CollectInsertSizeMetrics          \
Line 285: Line 216:
  
 
MeanQualityByCycle
 
MeanQualityByCycle
  java -jar /tools/picard-tools-1.32/MeanQualityByCycle.jar \
+
  java -Xmx4g -jar /tools/picard-tools-1.32/MeanQualityByCycle.jar \
 
     I=/output/filename.b37_1kg.sorted.bam                  \
 
     I=/output/filename.b37_1kg.sorted.bam                  \
 
     O=/output/filename.b37_1kg.MeanQualityByCycle          \
 
     O=/output/filename.b37_1kg.MeanQualityByCycle          \
Line 293: Line 224:
  
 
QualityScoreDistribution
 
QualityScoreDistribution
  java -jar /tools/picard-tools-1.32/QualityScoreDistribution.jar      \
+
  java -Xmx4g -jar /tools/picard-tools-1.32/QualityScoreDistribution.jar      \
 
     I=/output/filename.b37_1kg.sorted.bam                                \
 
     I=/output/filename.b37_1kg.sorted.bam                                \
 
     O=/output/filename.b37_1kg.[wiki:QualityScoreDistribution]            \
 
     O=/output/filename.b37_1kg.[wiki:QualityScoreDistribution]            \
Line 301: Line 232:
  
 
BamIndexStats
 
BamIndexStats
  java -jar /tools/picard-tools-1.32/BamIndexStats.jar \
+
  java -Xmx4g -jar /tools/picard-tools-1.32/BamIndexStats.jar \
 
     INPUT=/output/filename.b37_1kg.sorted.bam          \
 
     INPUT=/output/filename.b37_1kg.sorted.bam          \
 
     VALIDATION_STRINGENCY=LENIENT                    \
 
     VALIDATION_STRINGENCY=LENIENT                    \
Line 307: Line 238:
  
 
CalculateHsMetricsWholeGenome
 
CalculateHsMetricsWholeGenome
  java -jar -Xmx3g /tools/picard-tools-1.32/CalculateHsMetricsWholeGenome.jar \
+
  java -Xmx4g -jar /tools/picard-tools-1.32/CalculateHsMetricsWholeGenome.jar \
 
     INPUT=/output/filename.b37_1kg.sorted.bam                                \
 
     INPUT=/output/filename.b37_1kg.sorted.bam                                \
 
     OUTPUT=/output/filename.b37_1kg.HsMetrics                                \
 
     OUTPUT=/output/filename.b37_1kg.HsMetrics                                \
Line 315: Line 246:
 
     TMP_DIR=/big/drive
 
     TMP_DIR=/big/drive
  
=== Split BAMS by Chromosome ===
+
=== Reduce Reads ===
  
BAMs are split into chromosomes BAMs. These files move on to variant calling.
+
java -Xmx4g -jar GenomeAnalysisTK.jar \
 
+
  -T ReduceReads                      \
=== Compress BAMS with ReduceRead ===
+
  -I Sample1.recal.bam                \
 +
  -o Sample1.reduced.bam              \
 +
  -R human_g1K_v37.fasta              \
 +
  2> ReduceReads.error
  
 
== Variant Calling ==
 
== Variant Calling ==
  
 
=== UnifiedGenotyper ===
 
=== UnifiedGenotyper ===
  java -jar GenomeAnalysisTK.jar \
+
  java -Xmx4g -jar GenomeAnalysisTK.jar \
   -R resources/Homo_sapiens_assembly18.fasta \
+
   -T UnifiedGenotyper                  \
   -T UnifiedGenotyper \
+
   -I Sample1.reduced.bam              \
   -I sample1.bam [-I sample2.bam ...] \
+
   -I Sample2.reduced.bam             \
   --dbsnp dbSNP.vcf \
+
  -I Background1.reduced.bam         \
   -o snps.raw.vcf \
+
   -I Background2.reduced.bam          \
   -stand_call_conf [50.0] \
+
   -o Sample1.raw.vcf                 \
   -stand_emit_conf 10.0 \
+
   -R human_g1K_v37.fasta              \
   -dcov [50 for 4x, 200 for >30x WGS or Whole exome] \
+
   --dbsnp dbSNP.vcf                    \
   [-L targets.interval_list]
+
   -dcov 250                          \
+
   -L TruSeq_exome_regions.bed        \
 
+
  2> UnifiedGenotyper.error
  
 
=== VariantRecalibrator ===
 
=== VariantRecalibrator ===
 
  java -Xmx4g -jar GenomeAnalysisTK.jar \
 
  java -Xmx4g -jar GenomeAnalysisTK.jar \
   -T VariantRecalibrator \
+
   -T VariantRecalibrator             \
   -R reference/human_g1k_v37.fasta \
+
   -R human_g1K_v37.fasta             \
   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
+
   -input Sample1.raw.vcf             \
   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+
   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.vcf         \
   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+
   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.vcf       \
   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
+
   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_137.b37.vcf           \
 
   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
 
   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
   -mode SNP \
+
   -mode SNP                           \
   -recalFile path/to/output.recal \
+
   -recalFile Sample1.recal           \
   -tranchesFile path/to/output.tranches \
+
   -tranchesFile Sample1.tranches     \
   -rscriptFile path/to/output.plots.R
+
   -rscriptFile Sample1.plots.R       \
 +
  2> VariantRecalibrator.error
 
   
 
   
 
=== ApplyRecalibration ===
 
=== ApplyRecalibration ===
  java -Xmx3g -jar GenomeAnalysisTK.jar \
+
  java -Xmx4g -jar GenomeAnalysisTK.jar \
   -T ApplyRecalibration \
+
   -T ApplyRecalibration               \
   -R reference/human_g1k_v37.fasta \
+
   -input Sample1.raw.vcf              \
   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
+
   -o Sample1.vqsr.vcf                \
   --ts_filter_level 99.0 \
+
  -R human_g1K_v37.fasta              \
   -tranchesFile path/to/output.tranches \
+
   --ts_filter_level 99.0               \
   -recalFile path/to/output.recal \
+
   -tranchesFile Sample1.tranches     \
   -mode SNP \
+
   -recalFile Sample1.recal           \
   -o path/to/output.recalibrated.filtered.vcf
+
   -mode SNP                           \
 
+
   2> ApplyRecalibration.error
== Variant File Analyses ==
 
 
 
== Family Based Analyses ==
 
 
 
=== IBD/SGS ===
 
 
 
=== Linkage ===
 
 
 
== Population Based Analyses ==
 
 
 
=== Phasing ===
 
 
 
=== FST ===
 
 
 
== Phenotype Based Analyses ==
 
  
=== GWAS ===
+
== Variant File QC ==
  
==== VAAST ====
+
Ti/Tv Ratio
 +
HapMap concordance
 +
SNV/Indel Counts

Revision as of 17:29, 6 September 2013

Utah Genome Project

Variant Calling Pipeline 0.0.2

Sept. 2013

Data Source

Data sets used for the variant calling pipeline come from the Broad GSA (GATK) group as the 'GATK resource bundle 2.5' version 2.5

wget -r ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.5/b37
  • Reference Genome (GRCh37):
human_g1K_v37.fasta
  • High confidence SNVs
 1000G_omni2.5.b37.vcf
 hapmap_3.3.b37.vcf
  • dbSNP
 dbsnp_137.b37.vcf.gz
  • High confidence indels
 Mills_and_1000G_gold_standard.indels.b37.vcf.gz

Sequencing

This pipeline is designed for 100 bp Illumina HiSeq PE exome or WGS sequence data with Sanger/Illumina 1.9 quality encoding.

Validate File Integrity with md5sum

An md5sum signature should be provided for each FastQ file by the sequencing center. After the file has been downloaded, locally check the md5sum to be sure that no data corruption occurred during the file transfer.

md5sum file.fastq > file_local.md5
diff file_local.md5 file_provided.md5

If the md5sum signature differs from that provided for the file:

  • Check to be sure you have the correct file.
  • Check if the md5sum was calculated on that compressed or uncompressed file by the provider and be sure to do the same with the local copy.
  • Try the download again.
  • Contact the sequence provider.

FastQ File Analyses

fastqc Sample1_L1_R1.txt

From the fastqc_data.txt file we check the following values:

  • Encoding (must be Sanger / Illumina 1.9)
  • Total Sequences (Need to develop a acceptable range)
  • Filtered Sequences (Should be 0 or at least very low)
  • Sequence length (must be ~ 100 bp)
  • %GC (should be 45 < x < 55)
  • Total Duplicate Percentage (This value may not be valuable - an acceptable range has not been determined).

Alignment

Index the reference sequence if this has not already been done

bwa index -a bwtsw human_g1K_v37.fasta

The 'bwa aln' program will find the reference coordinates of the input reads (independent of their mate-pair). The following parameters are those used by the 1KG project for aligning Illumina data.

bwa aln -q 15 human_g1K_v37.fasta file.fastq > Sample1_L1_R1.sai # One lane of reads first in pair
bwa aln -q 15 human_g1K_v37.fasta file.fastq > Sample1_L1_R2.sai # One lane of reads second in pair

The 'bwa sampe'. For paired-end reads, the maximum insert size is taken to be 3 times the expected insert size.

bwa sampe -P human_g1K_v37.fasta Sample1_L1_R1.sai Sample1_L1_R1.sai Sample1_L1_R1.fastq Sample1_L1_R2.fastq | samtools view -bt human_g1K_v37.fasta.fai -o Sample1_L1 -

This will switch to bwa mem soon.

bwa mem human_g1K_v37.fasta Sample1_L1_R1.fq Sample1_L1_R2.fq | samtools view -bt human_g1K_v37.fasta.fai -o Sample1_L1 -

Bwa Command Line Options:

  • -n NUM max #diff (int) or missing prob under 0.02 err rate (float) [0.04]
  • -o INT maximum number or fraction of gap opens [1]
  • -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]
  • -i INT do not put an indel within INT bp towards the ends [5]
  • -d INT maximum occurrences for extending a long deletion [10]
  • -l INT seed length [32]
  • -k INT maximum differences in the seed [2]
  • -m INT maximum entries in the queue [2000000]
  • -t INT number of threads [1]
  • -M INT mismatch penalty [3]
  • -O INT gap open penalty [11]
  • -E INT gap extension penalty [4]
  • -R INT stop searching when there are >INT equally best hits [30]
  • -q INT quality threshold for read trimming down to 35bp [0]
  • -c input sequences are in the color space
  • -L log-scaled gap penalty for long deletions
  • -N non-iterative mode: search for all n-difference hits (slooow)
  • -f FILE file to write output to instead of stdout

The AddOrReplaceReadGroups tool will add the correct readgroup ID to the BAM files

java -Xmx4g -jar picard-tools/AddOrReplaceReadGroups.jar \
   INPUT=Sample1_L1.bam                                             \
   OUTPUT=Sample1_L1.rg.bam                                         \
   CREATE_INDEX=TRUE                                                \
   VALIDATION_STRINGENCY=LENIENT                                    \
   SORT_ORDER=coordinate                                            \
   RGID=9958X1                                                      \
   RGLB=9958X1                                                      \
   RGPL=illumina                                                    \
   RGPU=1                                                           \
   TMP_DIR=/big/drive                                               \
   2> 9958X1_AddOrReplaceReadGroups.error

BAM File Analyses

Alignment BAM files are improved in various ways to help increase the quality and speed of subsequent variant calling steps.

Merge lane level BAMs to individual

java -Xmx4g -jar picard-tools/MergeSamFiles.jar \
    INPUT=Sample1_L1.rg.bam                     \
    INPUT=Sample1_L2.rg.bam                     \                                                                                                 
    OUTPUT=Sample1.bam                          \
    CREATE_INDEX=TRUE                           \
    ASSUME_SORTED=true                          \
    USE_THREADING=true                          \
    VALIDATION_STRINGENCY=SILENT                \
    TMP_DIR=/big/drive                          \
    2> MergeSameFiles.error

Mark Duplicates

Remove PCR/Optical duplicate reads

java -Xmx4g -jar MarkDuplicates.jar    \
   INPUT=Sample1.bam                   \
   OUTPUT=Sample1.dedup.bam            \
   METRICS_FILE=lane1_dup_metrics.txt  \
   ASSUME_SORTED=true                  \
   USE_THREADING=true                  \
   VALIDATION_STRINGENCY=SILENT        \
   TMP_DIR=/big/drive                  \
   2> MarkDuplicates.error

Local Realignment of Indels

java -Xmx4g -jar GenomeAnalysisTK.jar                  \
   -I All_BAMs.list                                    \
   -o Realign_Intervals.txt                            \
   -T RealignerTargetCreator                           \
   -R human_g1K_v37.fasta                              \
   -known Mills_and_1000G_gold_standard.indels.b37.vcf \
   -known 1000G_phase1.indels.b37.vcf                  \
   -nt 24                                              \
   2> RealignerTargetCreator.error
java -Xmx4g -jar GenomeAnalysisTK.jar                  \
   -T IndelRealigner                                   \
   -I Sample1.dedup.bam                                \
   -o Sample1.realign.bam                              \
   -R human_g1K_v37.fasta                              \
   -targetIntervals Realign_Intervals.txt              \
   -known Mills_and_1000G_gold_standard.indels.b37.vcf \
   2> IndelRealigner.error

Base Quality Score Recalibration

java -Xmx4g -jar GenomeAnalysisTK.jar \
  -T PrintReads                       \
  -I Sample1.realign.bam              \
  -o Sample1.recal.bam                \
  -R human_g1K_v37.fasta              \
  -BQSR recalibration_report.grp      \

BAM Quality Control

  • Picard Tools
    • BamIndexStats
    • CalculateHsMetrics ??
    • CleanSam
    • CollectAlignmentSummaryMetrics
    • CollectGcBiasMetrics
    • CollectInsertSizeMetrics
    • CollectMultipleMetrics
    • CollectTargetedPcrMetrics
    • EstimateLibraryComplexity
    • FixMateInformation
    • MarkDuplicates
    • MeanQualityByCycle
    • QualityScoreDistribution
    • ValidateSamFile
  • GATK QC
  • [1]
  • [2]

CollectAlignmentSummaryMetrics

java -Xmx4g -jar /tools/picard-tools-1.32/CollectAlignmentSummaryMetrics.jar \
   I=/output/filename.b37_1kg.sorted.bam                                      \
   O=/output/filename.b37_1kg.AlignmentSummaryMetrics                         \
   R=human_g1K_v37.fasta                                     \
   VALIDATION_STRINGENCY=LENIENT                                             \
   TMP_DIR=/big/drive

CollectGcBiasMetrics

java -Xmx4g -jar /tools/picard-tools-1.32/CollectGcBiasMetrics.jar \
   R=human_g1K_v37.fasta                    \
   I=/output/filename.b37_1kg.sorted.bam                     \
   O=/output/filename.b37_1kg.GcBiasMetrics                  \
   CHART=/output/filename.b37_1kg.GcBiasMetrics.pdf          \
   VALIDATION_STRINGENCY=LENIENT                            \
   TMP_DIR=/big/drive

CollectInsertSizeMetrics

java -Xmx4g -jar /tools/picard-tools-1.32/CollectInsertSizeMetrics.jar \
   I=/output/filename.b37_1kg.sorted.bam                         \
   O=/output/filename.b37_1kg.CollectInsertSizeMetrics           \
   H=/output/filename.b37_1kg.CollectInsertSizeMetrics.pdf       \
   VALIDATION_STRINGENCY=LENIENT                                \
   TMP_DIR=/big/drive

MeanQualityByCycle

java -Xmx4g -jar /tools/picard-tools-1.32/MeanQualityByCycle.jar \
   I=/output/filename.b37_1kg.sorted.bam                   \
   O=/output/filename.b37_1kg.MeanQualityByCycle           \
   CHART=/output/filename.b37_1kg.MeanQualityByCycle.pdf   \
   VALIDATION_STRINGENCY=LENIENT                          \
   TMP_DIR=/big/drive

QualityScoreDistribution

java -Xmx4g -jar /tools/picard-tools-1.32/QualityScoreDistribution.jar       \
   I=/output/filename.b37_1kg.sorted.bam                                 \
   O=/output/filename.b37_1kg.[wiki:QualityScoreDistribution]            \
   CHART=/output/filename.b37_1kg.[wiki:QualityScoreDistribution].pdf    \
   VALIDATION_STRINGENCY=LENIENT                                        \
   TMP_DIR=/big/drive

BamIndexStats

java -Xmx4g -jar /tools/picard-tools-1.32/BamIndexStats.jar \
   INPUT=/output/filename.b37_1kg.sorted.bam          \
   VALIDATION_STRINGENCY=LENIENT                     \
   TMP_DIR=/big/drive

CalculateHsMetricsWholeGenome

java -Xmx4g -jar /tools/picard-tools-1.32/CalculateHsMetricsWholeGenome.jar \
   INPUT=/output/filename.b37_1kg.sorted.bam                                 \
   OUTPUT=/output/filename.b37_1kg.HsMetrics                                 \
   BAIT_INTERVALS=/resources//hg19/intervals/GoNL.interval_list             \
   TARGET_INTERVALS=/resources//hg19/intervals/GoNL.interval_list           \
   VALIDATION_STRINGENCY=LENIENT                                            \
   TMP_DIR=/big/drive

Reduce Reads

java -Xmx4g -jar GenomeAnalysisTK.jar \
  -T ReduceReads                      \
  -I Sample1.recal.bam                \
  -o Sample1.reduced.bam              \
  -R human_g1K_v37.fasta              \
  2> ReduceReads.error

Variant Calling

UnifiedGenotyper

java -Xmx4g -jar GenomeAnalysisTK.jar \
  -T UnifiedGenotyper                  \
  -I Sample1.reduced.bam              \
  -I Sample2.reduced.bam              \
  -I Background1.reduced.bam          \
  -I Background2.reduced.bam          \
  -o Sample1.raw.vcf                  \
  -R human_g1K_v37.fasta              \
  --dbsnp dbSNP.vcf                    \
  -dcov 250                           \
  -L TruSeq_exome_regions.bed         \
  2> UnifiedGenotyper.error

VariantRecalibrator

java -Xmx4g -jar GenomeAnalysisTK.jar \
  -T VariantRecalibrator              \
  -R human_g1K_v37.fasta              \
  -input Sample1.raw.vcf              \
  -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.vcf          \
  -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.vcf        \
  -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_137.b37.vcf            \
  -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
  -mode SNP                           \
  -recalFile Sample1.recal            \
  -tranchesFile Sample1.tranches      \
  -rscriptFile Sample1.plots.R        \
  2> VariantRecalibrator.error

ApplyRecalibration

java -Xmx4g -jar GenomeAnalysisTK.jar \
  -T ApplyRecalibration               \
  -input Sample1.raw.vcf              \
  -o Sample1.vqsr.vcf                 \
  -R human_g1K_v37.fasta              \
  --ts_filter_level 99.0                \
  -tranchesFile Sample1.tranches      \
  -recalFile Sample1.recal            \
  -mode SNP                           \
  2> ApplyRecalibration.error

Variant File QC

Ti/Tv Ratio
HapMap concordance
SNV/Indel Counts