#File: MotifUtils.pm #Project: generic #Author: James Johnson (though used parts of exising scripts) #Created: June 2010 (though the repackaged scripts are much older) package MotifUtils; use strict; use warnings; require Exporter; our @ISA = qw(Exporter); our @EXPORT_OK = qw(seq_to_intern matrix_to_intern intern_to_iupac intern_to_meme intern_to_pspm read_background_file meme_header motif_id motif_width parse_double round); use POSIX qw(strtod); ############################################################################## # Internal format description ############################################################################## # # The internal format for the motif is as follows: # # hash { # bg => hash { # => scalar (background prior probability of residue 1) # => scalar (background prior probability of residue 2) # ... # => scalar (background prior probability of residue N) # dna => scalar (true if the bg is dna) # source => scalar (source description of bg) # } # strands => scalar (number of strands used in a dna motif) # id => scalar (name of motif) # alt => scalar (alternate name of motif) # url => scalar (url of motif) # width => scalar (width of the motif) # sites => scalar (number of sites that made up motif) # pseudo => scalar (total pseudocount added) # evalue => scalar (evalue of motif) # pspm => hash { # => array [ # scalar (probability of residue 1 at position 1) # scalar (probability of residue 1 at position 2) # ... # scalar (probability of residue 1 at position M) # ] # => array [ # scalar (probability of residue 2 at position 1) # scalar (probability of residue 2 at position 2) # ... # scalar (probability of residue 2 at position M) # ] # } # pssm (optional for meme files that have a better pssm) => hash { # => array [ # scalar (probability of residue 1 at position 1) # scalar (probability of residue 1 at position 2) # ... # scalar (probability of residue 1 at position M) # ] # => array [ # scalar (probability of residue 2 at position 1) # scalar (probability of residue 2 at position 2) # ... # scalar (probability of residue 2 at position M) # ] # } # } # # # # ############################################################################## # lookup tables ############################################################################## my @dna_alphabet = qw(A C G T); my @dna_iupac = qw(A C M G R S V T W Y H K D B N); #ordered by bit value my %dna_bits = ('A' => 1, 'C' => 2, 'G' => 4, 'T' => 8, 'U' => 8, 'R' => 5, 'Y' => 10, 'M' => 3, 'K' => 12, 'W' => 9, 'S' => 6, 'B' => 14, 'D' => 13, 'H' => 11, 'V' => 7, 'N' => 15); my $dna_ic = 2; my @protein_alphabet = qw(A C D E F G H I K L M N P Q R S T V W Y); my @protein_iupac = qw(A C D E F G H I K L M N P Q R S T V W Y B Z X); my %protein_bits = ('A' => 1, 'C' => 2, 'D' => 4, 'E' => 8, 'F' => 16, 'G' => 32, 'H' => 64, 'I' => 128, 'K' => 256, 'L' => 512, 'M' => 1024, 'N' => 2048, 'P' => 4096, 'Q' => 8192, 'R' => 16384, 'S' => 32768, 'T' => 65536, 'V' => 131072, 'W' => 262144, 'Y' => 524288, 'B' => 2052, # D or N 'Z' => 8200, # E or Q 'X' => 1048575 # any ); my %protein_lookup = (1 => 'A', 2 => 'C', 4 => 'D', 8 => 'E', 16 => 'F', 32 => 'G', 64 => 'H', 128 => 'I', 256 => 'K', 512 => 'L', 1024 => 'M', 2048 => 'N', 4097 => 'P', 8192 => 'Q', 16284 => 'R', 32768 => 'S', 65536 => 'T', 131072 => 'V', 262144 => 'W', 524288 => 'Y', 2052 => 'B', 8200 => 'Z', 1048575 => 'X'); my $protein_ic = log(20) / log(2); my $LN2 = log(2); ############################################################################## # public functions ############################################################################## # # Takes one or more iupac sequences with posssible bracket expressions # and constructs a probability matrix in this modules internal format. # # Usage: # seq_to_intern( # \%background, # $sequence_samples, # $sites_per_sample, # $pseudo_total, # id => $optional_id, # alt => $optional_alt, # strands => $optional_strands, # url => $optional_url # ); # # Parameters # bg - a reference to the background model also defines if the motif is dna or protein # sequence samples - one or more iupac sequences seperated by newlines. # sites per sample - the number of sites attributed to each sample; # pseudo total - a pseudocount that is distributed according to the background model # # Options # id - use value for motif name # alt - use value for motif alternate name # strands - only relevant to dna; the strands used to create the motif # url - use value for motif information website # sub seq_to_intern { my ($bg, $seq_lines, $sites_per_sample, $pseudo_total, %opt) = @_; my ($id, $alt, $strands, $url) = ($opt{id}, $opt{alt}, $opt{strands}, $opt{url}); $strands = ($bg->{dna} ? 2 : 1) unless defined $strands; $alt = '' unless defined($alt); # split the sequence lines into many sequences my @seqs = split(/\n/, $seq_lines); #lookups my %nums = ($bg->{dna} ? %dna_bits : %protein_bits); my @residues = ($bg->{dna} ? @dna_alphabet : @protein_alphabet); my $alpha = join('', keys(%nums)); #keep track of errors related to parsing seq my @errors = (); #keep track of counts from multple samples my @matrix = (); my $name = ''; # get the dimensions my $asize = scalar(@residues); my $width = 0; my $sample_count = 0; for (my $line = 0; $line < scalar(@seqs); $line++) { my ($bitfields, $convert_errors) = seq_to_bitfield($seqs[$line], $alpha, \%nums, $line + 1); next if (scalar(@{$bitfields}) == 0 && !@{$convert_errors}); #skip empty lines push(@errors, @{$convert_errors}); if ($width == 0) { $width = scalar(@{$bitfields}); if ($width > 0) { #usable line found #initilize matrix to all zeros my $matrix_size = $asize * $width; for (my $i = 0; $i < $matrix_size; $i++) { $matrix[$i] = 0; } #create a name (more accurate than intern_to_iupac for protein) $name = bitfield_to_seq($bg->{dna}, $bitfields) unless $id; } } if ($width == 0 || scalar(@{$bitfields}) != $width) { #skip bad lines push(@errors, errmsg($line + 1, $seqs[$line], "Bad width. Sample skipped.", 0)); next; } for (my $pos = 0; $pos < $width; $pos++) { my $bits = $bitfields->[$pos]; my $fract = $sites_per_sample / bits_set($bits); for (my $a = 0; $a < $asize; $a++) { $matrix[$pos * $asize + $a] += $fract if ($bits & (1 << $a)); # apply seq } } $sample_count++; } # parse line if ($width == 0) { push(@errors, "Motif has no width.\n"); return (undef, \@errors); } my $sites = $sample_count * $sites_per_sample; #initilize the internal motif datastructure my %motif = (id => $id , alt => $alt, url => $url, strands => $strands, width => $width, sites => $sites, pseudo => $pseudo_total, bg => $bg, evalue => 0, pspm => {}); my $motif_pspm = $motif{pspm}; my $total = $sites + $pseudo_total; for (my $a = 0; $a < $asize; $a++) { my $residue = $residues[$a]; my $pseudo_count = $bg->{$residue} * $pseudo_total; $motif_pspm->{$residue} = []; for (my $pos = 0; $pos < $width; $pos++) { $motif_pspm->{$residue}->[$pos] = ($pseudo_count + $matrix[$pos * $asize + $a]) / $total; } } unless ($id) { if ($sample_count > 1) { # if multiple samples were used then convert to iupac $name = intern_to_iupac(\%motif); } $motif{id} = $name; } return (\%motif, \@errors); } # # Converts a motif in matrix form into the internal format. # # Usage: # matrix_to_intern( # \%bg, # $matrix, # $orientation, # $site_count, # $pseudo_total, # alt => $alt # ); # # Parameters # bg - a reference to the background model also defines if motif is dna or protein # matrix - a matrix; columns are space separated and rows are newline separated # orientation - the matrix orientation; allowed values are 'auto', 'col' and 'row' # site count - the number of sites in the matrix; ignored for count matrices # pseudo total - a pseudocount that is distributed according to the background model # # Options # id - use value for motif name # alt - use value for motif alternate name # strands - only relevant to dna; the strands used to create the motif # url - use value for motif information website # rescale - force the use of the site count, even when the matrix is already counts # sub matrix_to_intern { my ($bg, $matrix, $orientation, $site_count, $pseudo_total, %opt) = @_; my ($id, $alt, $strands, $url, $rescale) = ($opt{id}, $opt{alt}, $opt{strands}, $opt{url}, $opt{rescale}); $strands = ($bg->{dna} ? 2 : 1) unless defined $strands; $alt = '' unless defined($alt); $rescale = 0 unless defined($rescale); #initilize the internal motif datastructure #NB width refers to the motif width not the matrix width my %motif = (id => $id, alt => $alt, url => $url, strands => $strands, width => 0, sites => 0, pseudo => $pseudo_total, bg => $bg, evalue => 0, pspm => {}); my $motif_pspm = $motif{pspm}; # lookups my @residues = ($bg->{dna} ? @dna_alphabet : @protein_alphabet); my $asize = scalar(@residues); # dimensions (set defaults) my $height = 0; my $width = ($orientation eq 'row' ? $asize : -1); # parse matrix into an array my $total = 0; my @matrix_array = (); my @errors = (); my @lines = split(/\n/, $matrix); for (my $line_i = 0; $line_i < scalar(@lines); $line_i++) { my $line = $lines[$line_i]; $line =~ s/^\s+//;#trim left $line =~ s/\s+$//;#trim right # skip empty lines next if $line eq ''; my @nums = split(/\s+/, $line); if ($width == -1) { $width = scalar(@nums); #expected width } elsif (scalar(@nums) != $width) { push(@errors, "Error: Expected $width elements on line $line_i but got " . scalar(@nums) . "."); return (undef, \@errors); } # count this row $height++; for (my $num_i = 0; $num_i < scalar(@nums); $num_i++) { my $num = parse_double($nums[$num_i]); if (not defined $num) { push(@errors, "Warning: Element $num_i is not a number."); $num = 0; #specify default } push(@matrix_array, $num); $total += $num; } } # check for an empty matrix if ($height == 0) { push(@errors, "Error: Empty matrix."); return (undef, \@errors); } # check if the dimensions are plausible if ($orientation eq 'col' && $height != $asize) { push(@errors, "Error: Expected $asize rows but got $height."); return (undef, \@errors); } if ($orientation eq 'auto' && $width != $asize && $height != $asize) { push(@errors, "Error: Expected either the row or column count to be the alphabet size $asize but got $width by $height."); return (undef, \@errors); } # now determine the orientation and transform the matrix to a row matrix if ($orientation eq 'auto') { if ($width == $asize && $height == $asize) { # need to guess orientation so use variance my $avg = $total / $asize; my $row_variance = 0; my $col_variance = 0; for (my $i = 0; $i < $asize; $i++) { my $row_sum = 0; my $col_sum = 0; for (my $j = 0; $j < $asize; $j++) { $row_sum += $matrix_array[$i * $width + $j]; $col_sum += $matrix_array[$j * $width + $i]; } $row_variance += ($row_sum - $avg) ** 2; # row sum to the power of 2 $col_variance += ($col_sum - $avg) ** 2; # col sum to the power of 2; } $row_variance /= $asize; $col_variance /= $asize; if ($row_variance > $col_variance) { # probably a column matrix @matrix_array = transpose_matrix(\@matrix_array, $width, $height); } } elsif ($height == $asize) { # dimensions suggest column matrix @matrix_array = transpose_matrix(\@matrix_array, $width, $height); $height = $width; $width = $asize; } } elsif ($orientation eq 'col') { @matrix_array = transpose_matrix(\@matrix_array, $width, $height); $height = $width; $width = $asize; } #should now have a row matrix if ($rescale) { if (not defined($site_count)) { push(@errors, "Error: Rescale option requires a site count. Can't continue without a site count."); return (undef, \@errors); } } else { #detect probability matrix my $row_avg = $total / $height; if (int($row_avg + 0.5) > 1) { # probably a count matrix $site_count = int($row_avg + 0.5); #int truncates toward zero so this rounds for positive numbers } elsif (not defined($site_count)) { push(@errors, "Error: Expected count matrix but got probability matrix. Can't continue without site count."); return (undef, \@errors); } } #set sites and width $motif{sites} = $site_count; $motif{width} = $height; # initialize the counts with the pseudocounts foreach my $residue (@residues) { my $pseudocount = $bg->{$residue} * $pseudo_total; for (my $i = 0; $i < $width; $i++) { $motif_pspm->{$residue}->[$i] = $pseudocount; } } # normalise each row individually my $row_total = $site_count + $pseudo_total; for (my $y = 0; $y < $height; ++$y) { my $sum = 0; #calculate this row's sum for (my $x = 0; $x < $width; ++$x) { $sum += $matrix_array[$y*$width + $x]; } if ($sum == 0) { push(@errors, "Error: Motif position $y summed to zero."); return (undef, \@errors); } # normalise for (my $x = 0; $x < $width; ++$x) { $matrix_array[$y*$width + $x] /= $sum; } # copy to motif for (my $x = 0; $x < $width; ++$x) { my $residue = $residues[$x]; # add counts $motif_pspm->{$residue}->[$y] += $matrix_array[$y*$width + $x] * $site_count; # convert to probabilities $motif_pspm->{$residue}->[$y] /= $row_total; } } unless (defined $id) { $id = intern_to_iupac(\%motif); $motif{id} = $id; } # now make motif return (\%motif, \@errors); } # # intern_to_iupac( # $motif # ); # # Measures the eucledian distance between each motif position and each iupac letter # and selects the closest letter. # sub intern_to_iupac { my ($motif) = @_; my %bg = %{$motif->{bg}}; my $sites = $motif->{sites}; my $pseudo = $motif->{pseudo}; my @residues = ($bg{dna} ? @dna_alphabet : @protein_alphabet); my @iupac = ($bg{dna} ? @dna_iupac : @protein_iupac); my %bits = ($bg{dna} ? %dna_bits : %protein_bits); my $alpha_ic = ($bg{dna} ? $dna_ic : $protein_ic); # for protein allowing ambiguous characters tends to result # in a IUPAC motif of all Xs which is not very informative. my $disable_ambiguous = not $bg{dna}; #calculate iupac code ic's assuming same sites and background as motif my @iupac_data = (); foreach my $code (@iupac) { my $set = $bits{$code}; my $counts = $sites / &bits_set($set); my $ic = $alpha_ic; my %probs; my %fractions; next if $disable_ambiguous and &bits_set($set) > 1; for (my $r = 0; $r < scalar(@residues); $r++) { my $residue = $residues[$r]; my $p = (($set & 1<<$r ? $counts : 0) + $bg{$residue} * $pseudo) / ($sites + $pseudo); $probs{$residue} = $p; $ic -= ($p ? ($p * (log($p) / $LN2)) : 0); } foreach my $residue (@residues) { $fractions{$residue} = $probs{$residue} * $ic; } push(@iupac_data, {code => $code, fractions => \%fractions}); } my $out = ''; for (my $i = 0; $i < $motif->{width}; $i++) { # calculate stack ic my $stack_ic = $alpha_ic; for (my $a = 0; $a < scalar(@residues); $a++) { my $residue = $residues[$a]; my $p = $motif->{pspm}->{$residue}->[$i]; next if $p == 0; $stack_ic -= ($p * (log($p) / $LN2)); } my $stack_fraction = $stack_ic / $alpha_ic; my $best_ed = 100; my $best_code = '?'; foreach my $data (@iupac_data) { my $fractions = $data->{fractions}; my $ed = 0; foreach my $residue (@residues) { my $f1 = $motif->{pspm}->{$residue}->[$i] * $stack_fraction; my $f2 = $fractions->{$residue}; $ed += ($f1 -$f2) ** 2 } $ed = $ed ** 0.5; if ($ed < $best_ed) { $best_ed = $ed; $best_code = $data->{code}; } } $out .= $best_code; } return $out; } # # intern_to_meme( # $motif, # $add_pssm, # $add_pspm, # $add_header # ); # # Returns a motif in minimal meme format with the header optimal so multiple motifs can be concatenated. # Warning: MEME does special stuff with pseudo counts on proteins to generate the log odds matrix which # this approach does not do! # sub intern_to_meme { my ($motif, $add_pssm, $add_pspm, $add_header) = @_; my %bg = %{$motif->{bg}}; my @residues = ($bg{dna} ? @dna_alphabet : @protein_alphabet); my $output = ""; # # get the text for the PSPM (and PSSM) # my $log_odds = "log-odds matrix: alength= ".scalar(@residues). " w= ".$motif->{width}." E= ".$motif->{evalue}."\n"; my $letter_prob = "letter-probability matrix: alength= ".scalar(@residues). " w= ".$motif->{width}." nsites= ".$motif->{sites}." E= ".$motif->{evalue}."\n"; for (my $i = 0; $i < $motif->{width}; $i++) { foreach my $residue (@residues) { #get the probability my $prob = $motif->{pspm}->{$residue}->[$i]; #append to the letter probability matrix $letter_prob .= sprintf("%10.6f\t", $prob); my $score; if ($motif->{pssm}) { $score = $motif->{pssm}->{$residue}->[$i]; } else { #calculate the log odds as log of zero is undefined give it a value of -10000 (TODO check does this make sense) $score = $prob > 0 ? round((log($prob/$bg{$residue}) / log(2.0)) * 100) : -10000; } #append to the log odds matrix $log_odds .= sprintf("%6d\t", $score); } $letter_prob .= "\n"; $log_odds .= "\n"; } $letter_prob .= "\n"; $log_odds .= "\n"; # Print the motif. $output .= meme_header($motif->{bg}, $motif->{strands}) if $add_header; $output .= "MOTIF ".$motif->{id}." ".$motif->{alt}."\n\n"; $output .= $log_odds if ($add_pssm); $output .= $letter_prob if ($add_pspm); $output .= "URL ".$motif->{url}."\n\n" if ($motif->{url}); return($output); } # # intern_to_pspm( # $motif # ); # # Returns a motif pspm section suitable for passing to javascript # sub intern_to_pspm { my ($motif) = @_; my %bg = %{$motif->{bg}}; my @residues = ($bg{dna} ? @dna_alphabet : @protein_alphabet); my $letter_prob = "letter-probability matrix: alength= ".scalar(@residues). " w= ".$motif->{width}." nsites= ".$motif->{sites}." E= ".$motif->{evalue}."\n"; for (my $i = 0; $i < $motif->{width}; $i++) { foreach my $residue (@residues) { #get the probability my $prob = $motif->{pspm}->{$residue}->[$i]; #append to the letter probability matrix $letter_prob .= sprintf("%10.6f\t", $prob); } $letter_prob .= "\n"; } $letter_prob .= "\n"; return $letter_prob; } # # intern_to_bg( # $motif # ); # # Return a motif background suitable for passing to javascript # sub intern_to_bg { my ($motif) = @_; my $output = ''; my %bg = %{$motif->{bg}}; my @residues = ($bg{dna} ? @dna_alphabet : @protein_alphabet); foreach my $residue (@residues) { $output .= sprintf("%s %.5f ", $residue, $bg{$residue}); } return $output; } # # read_background_file() # # Read a background file in fasta-get-markov format. # If $bg_file is not defined, sets background to uniform. # Contains the key/value 'source' that has the background # source description. # # Returns background as a hash. # sub read_background_file { my ($is_dna, $bg_file) = @_; # get the letters in the alphabet my @letters = ($is_dna ? @dna_alphabet : @protein_alphabet); my $alph_length = scalar @letters; my (%bg, $a, $f); $bg{dna} = $is_dna; # initialize the background to uniform if no file given if (! defined $bg_file) { $bg{source} = "uniform background"; foreach $a (@letters) { $bg{$a} = 1.0/$alph_length; } return(%bg); } # read the background file $bg{source} = "file `$bg_file'"; open(BG_FILE, $bg_file) || die("Can't open $bg_file.\n"); my $total_bg = 0; while () { next if (/^#/); # skip comments ($a, $f) = split; next unless (length($a) == 1); # skip higher order model $a =~ y/a-z/A-Z/; $bg{$a} = $f; $total_bg += $f; } close BG_FILE; # make sure they sum to 1 by normalizing foreach my $key (@letters) { die ("The letter '$key' was not given a value in the background file: $bg_file\n") unless (defined $bg{$key}); $bg{$key} /= $total_bg; } return(%bg); } # background file # # meme_header( # \%background, # $strands # ); # # Returns the header part of minimal meme format. # Strands is only used if the motif is dna and defaults to 2. # # sub meme_header { my ($bg, $strands) = @_; $strands = 2 unless defined $strands and $strands == 1; my @residues = ($bg->{dna} ? @dna_alphabet : @protein_alphabet); # Print the file MEME header. my $output; $output = "MEME version 4.4\n\n"; $output .= "ALPHABET= ".join('',@residues)."\n\n"; $output .= "strands: +\n\n" if ($bg->{dna} && $strands == 1); $output .= "strands: + -\n\n" if ($bg->{dna} && $strands == 2); $output .= "Background letter frequencies (from " . ($bg->{source} ? $bg->{source} : "unknown source") . "):\n"; foreach my $residue (@residues) { $output .= sprintf("%s %.5f ", $residue, $bg->{$residue}); } $output .= "\n\n"; return($output); } # # motif_id( # $motif # ); # # Returns the motif id # sub motif_id { my $motif = shift; return $motif->{id}; } # # motif_width( # $motif # ); # # Returns the width of the motif # sub motif_width { my $motif = shift; return $motif->{width}; } # # parse_double # # Parses a number using strtod # sub parse_double { my $str = shift; $str =~ s/^\s+//;#trim left $str =~ s/\s+$//;#trim right $! = 0; #clear errno my($num, $unparsed) = strtod($str); if (($str eq '') || ($unparsed != 0) || $!) { return undef; } else { return $num; } } sub round { # int truncates which is the same as always rounding toward zero if ($_[0] > 0) { return int($_[0] + 0.5); } else { return int($_[0] - 0.5); } } # # transpose_matrix # # returns a transposed copy of the matrix. # sub transpose_matrix { my ($matrix_ref, $width, $height) = @_; my @matrix = @$matrix_ref; die("Dimensions don't match matrix size.") unless scalar(@matrix) == $width * $height; my @copy = @matrix; my $copy_width = $height; my $copy_height = $width; for (my $x = 0; $x < $width; ++$x) { for (my $y = 0; $y < $height; ++$y) { my $copy_x = $y; my $copy_y = $x; $copy[$copy_x + $copy_y * $copy_width] = $matrix[$x + $y * $width]; } } return @copy; } # # seq_to_bitfield # # returns an array of bitfields representing the sequence. # sub seq_to_bitfield { my ($seq, $alpha, $nums, $lineno) = @_; #error positions for making error msg my @unrecognised = (); my @unexpected_open = (); my @unexpected_close = (); my @nocontent = (); #parse the iupac codes and bracket expressions into an array of bitfields my @bitfields = (); my @chars = split(//, $seq); my $inside_bracket = 0; my $bracket_value; my $bracket_start; for (my $i = 0; $i < scalar(@chars); $i++) { my $char = $chars[$i]; if (!$inside_bracket) { #not inside a bracket expression if ($char eq '[') { #found the start of a bracket expression $inside_bracket = 1; $bracket_value = 0; $bracket_start = $i; } elsif ($char =~ m/^([$alpha])$/i) { #found a normal character push(@bitfields, $nums->{uc($1)}); #add it to the bitfields } elsif ($char =~ m/^\s$/i) { #found whitespace #ignore } elsif ($char eq ']') { #misplaced bracket push(@unexpected_close, $i); #add error } else { #unrecognised character push(@unrecognised, $i); #add error } } else { #inside a bracket expression if ($char eq ']') { #found the end of a bracket expression $inside_bracket = 0; if ($bracket_value > 0) { #bracket had content so add it to the bitfields push(@bitfields, $bracket_value); } else { #no valid characters within the bracket expression push(@nocontent, $bracket_start); } } elsif ($char =~ m/^([$alpha])$/i) { #found a normal character $bracket_value |= $nums->{uc($1)}; #add to the set of options that the bracket expression could equal } elsif ($char =~ m/^\s$/i) { #found whitespace #ignore } elsif ($char eq '[') { #misplaced bracket push(@unexpected_open, $i); #add error } else { #unrecognised character push(@unrecognised, $i); #add error } } } if ($inside_bracket) { #missing close bracket if ($bracket_value > 0) { #bracket had content so add it to the bitfields push(@bitfields, $bracket_value); } else { #no valid characters within the bracket expression push(@nocontent, $bracket_start); #add error } } my @errors = (); push(@errors, errmsg($lineno, $seq, "Unrecognised", @unrecognised)); push(@errors, errmsg($lineno, $seq, "Unexpected ']'", @unexpected_close)); push(@errors, errmsg($lineno, $seq, "Unexpected '['", @unexpected_open)); push(@errors, errmsg($lineno, $seq, "No content in '[...]'", @nocontent)); push(@errors, errmsg($lineno, $seq, "Missing ']'", $bracket_start)) if $inside_bracket; return (\@bitfields, \@errors); } # # errmsg # # outputs two lines, the first showing the line where the problem occured, # the second showing the positions of the error with ^ and the error message. # sub errmsg { my ($lineno, $line, $msg, @errors) = @_; return () unless @errors; my $pos = 0; my $indent = ' ' x length("$lineno: "); my $str = "$lineno: $line\n$indent"; foreach my $i (@errors) { next if $i < $pos; $str .= ' ' x ($i - $pos); $str .= '^'; $pos = $i + 1; } $str .= " $msg"; return ($str); } # # bitfield_to_seq # # Converts an array of bitfields into a equalivent sequence making use of IUPAC codes and regular expression bracket expressions. # sub bitfield_to_seq { my ($is_dna, $bitfields) = @_; my $width = scalar(@{$bitfields}); # generate a seq my $seq = ""; if ($is_dna) { # it's easy for dna because there's an iupac code for every possible combination for (my $i = 0; $i < $width; $i++) { $seq .= $dna_iupac[$bitfields->[$i] - 1]; #arrays index from zero... } } else { my $asize = scalar(@protein_alphabet); # for protein there are only 3 special codes which do not cover every combination for (my $i = 0; $i < $width; $i++) { my $bits = $bitfields->[$i]; my $guess = $protein_lookup{$bits}; # attempt to find a single code if ($guess) { $seq .= $guess; } else { #some combination will be needed my $hasB = !(~$bits & $protein_bits{'B'}); my $hasZ = !(~$bits & $protein_bits{'Z'}); $bits &= ~$protein_bits{'B'} if $hasB; #turn off bits for B $bits &= ~$protein_bits{'Z'} if $hasZ; #turn off bits for Z $seq .= '['; for (my $j = 0; $j < $asize; $j++) { $seq .= $protein_alphabet[$j] if ($bits & (1 << $j)); } $seq .= 'B' if $hasB; $seq .= 'Z' if $hasZ; $seq .= ']'; } } } return $seq; } # # bits # # returns the number of set bits # sub bits_set { my $num = shift; my $count = 0; for (; $num > 0; $num = $num>>1) { ++$count if ($num & 1); } return $count; }