#!/usr/bin/perl use strict; use warnings; use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; #use IO::Compress::Gzip qw(gzip $GzipError) ; ##### ##### ##### ##### ##### use Getopt::Std; use vars qw( $opt_a $opt_m $opt_n $opt_o $opt_v); # Usage my $usage = " script - does something useful. by Brian J. Knaus October 2009 Copyright (c) 2009 Brian J. Knaus. License is hereby granted for personal, academic, and non-profit use. Commercial users should contact the author (http://brianknaus.com). Great effort has been taken to make this software perform its said task however, this software comes with ABSOLUTELY NO WARRANTY, not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Usage: perl script.pl options required: -a fastq input file. optional: -m outfile base name [default is nuccomp*]. -n is the file gzipped [T/F, default is T]. -o summarize qualities [T/F, default is T]. -v verbose mode [T/F, default is F]. Version 0.1-1. "; # command line processing. getopts('a:m:n:o:v:'); die $usage unless ($opt_a); my ($inf, $ofile, $gz, $qs, $verb); $inf = $opt_a if $opt_a; $ofile = $opt_m ? $opt_m : "nuccomp"; $gz = $opt_n ? $opt_n : "T"; $qs = $opt_o ? $opt_o : "T"; $verb = $opt_v ? $opt_v : "F"; ##### ##### ##### ##### ##### # Globals. my ($temp, $i, $nreads); my (@nucs, @temp); my %nuch; my $j; my ($in, $outf, $out); my (@quals, $qual); ##### ##### ##### ##### ##### # Main. if($gz eq 'T'){ $in = new IO::Uncompress::Gunzip $inf or die "IO::Uncompress::Gunzip failed: $GunzipError\n"; } else { open($in, "<", $inf) or die "Can't open $inf: $!"; } chomp($temp[0] = <$in>); # First line id an id. chomp($temp[1] = <$in>); # Second line is a sequence. chomp($temp[2] = <$in>); # Third line is an id. chomp($temp[3] = <$in>); # Fourth line is quality. # Initialize array of hashes. @temp = split(//, $temp[1]); # Get read length. for $i (0..$#temp){ $nucs[$i]{A} = 0; $nucs[$i]{C} = 0; $nucs[$i]{G} = 0; $nucs[$i]{N} = 0; $nucs[$i]{T} = 0; $quals[$i] = 0; } if($gz eq 'T'){ close $in or die "$in: $GunzipError\n"; } else { close $in or die "$in: $!"; } # Process data. if($gz eq 'T'){ $in = new IO::Uncompress::Gunzip $inf or die "IO::Uncompress::Gunzip failed: $GunzipError\n"; } else { open($in, "<", $inf) or die "Can't open $inf: $!"; } $nreads = 0; while (<$in>){ chomp($temp[0] = $_); # First line id an id. chomp($temp[1] = <$in>); # Second line is a sequence. chomp($temp[2] = <$in>); # Third line is an id. chomp($temp[3] = <$in>); # Fourth line is quality. $qual = $temp[3]; $temp[1] = uc($temp[1]); @temp = split(//, $temp[1]); for ($i=0; $i <= $#temp; $i++){ $nucs[$i]{$temp[$i]} = $nucs[$i]{$temp[$i]} + 1; } if($qs eq 'T'){ proc_qual(\@quals, $qual); } $nreads++; } if($gz eq 'T'){ close $in or die "$in: $GunzipError\n"; } else { close $in or die "$in: $!"; } # Convert to strings. my $adn = $nucs[0]{A}; for $i (1..$#temp){ $adn = "$adn,$nucs[$i]{A}"; } my $thy = $nucs[0]{T}; for $i (1..$#temp){ $thy = "$thy,$nucs[$i]{T}"; } my $gua = $nucs[0]{G}; for $i (1..$#temp){ $gua = "$gua,$nucs[$i]{G}"; } my $cyt = $nucs[0]{C}; for $i (1..$#temp){ $cyt = "$cyt,$nucs[$i]{C}"; } my $ns = $nucs[0]{N}; for $i (1..$#temp){ $ns = "$ns,$nucs[$i]{N}"; } # Print to outfile. $outf = $ofile."_bases.txt"; open($out, ">", $outf) or die "Can't open $outf: $!"; print $out "$nreads total reads. A, T, G, C, N, qualities (optional).\n"; print $out "$adn\n"; print $out "$thy\n"; print $out "$gua\n"; print $out "$cyt\n"; print $out "$ns\n"; if($qs eq 'T'){ print $out join(",",@quals),"\n"; } close $out or die "$out: $!"; nuccomp_r($ofile); if($qs eq 'T'){ qbar($ofile); } ##### ##### ##### ##### ##### # Subfunctions. sub proc_qual { my $qar = shift; my $quals = shift; my @temp = split(//, $qual); my $i; for($i=0;$i<=$#temp;$i++){ ${$qar}[$i] = ${$qar}[$i] + ord($temp[$i])-33; } } sub nuccomp_r{ my $outf = shift; my $inf = $outf."_bases.txt"; my $rout = $outf."_nucplot.png"; $outf = $outf."_nuc.r"; my $out; open($out, ">", $outf) or die "Can't open $outf: $!"; print $out "x <- read.table('$inf', sep=',', skip=1, nrows=5) xsums <- colSums(x) # Convert to percentages. for (i in 1:nrow(x)){ x[i,] <- x[i,]/xsums } x <- as.matrix(x) png('$rout') barplot(x, space=0, names.arg=1:ncol(x), col=c('green', 'red', 'black', 'blue', 'grey'), xlab='Cycle',ylab='Nucleotide frequency') # mtext(c('A','T','G','C','N'), side=4, at=c(0.125, 0.375, 0.625, 0.875, 0.98), col=c('green', 'red', 'black', 'blue', 'grey'), las=1 ) dev.off()"; close $out or die "$out: $!"; my $cmd = "R CMD BATCH $outf"; system($cmd); if (-e $outf){unlink $outf;} if (-e $outf.".Rout"){unlink $outf.".Rout";} } sub qbar{ my $outf = shift; my $inf = $outf."_bases.txt"; my $rout = $outf."_qbarplot.png"; $outf = $outf."_qbar.r"; my $out; open($out, ">", $outf) or die "Can't open $outf: $!"; print $out "reads <- scan('$inf',n=1) # x <- scan('$inf', sep=',',skip=6, nlines=1) # Convert to averages. x <- x/reads png('$rout') barplot(x, space=0, names.arg=1:length(x), xlab='Cycle',ylab='Mean quality', ylim=c(0,40) ) abline(h=10) abline(h=20) abline(h=30) abline(h=40) # dev.off()"; close $out or die "$out: $!"; my $cmd = "R CMD BATCH $outf"; system($cmd); if (-e $outf){unlink $outf;} if (-e $outf.".Rout"){unlink $outf.".Rout";} } ##### ##### ##### ##### ##### # EOF.