#!@WHICHPERL@ -w
#
# FILE: nmica2meme
# AUTHOR: Timothy L. Bailey
# CREATE DATE: 16/10/2010
# DESCRIPTION: Convert a file containing list of TFBS motif matrices from 
# nestedMICA (BioTiffin/XMS) format to MEME output format. 

use warnings;
use strict;

use lib qw(@PERLLIBDIR@);

use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file);

use Fcntl qw(O_RDONLY);
use Getopt::Long;
use Pod::Usage;
use XML::Simple;
use Data::Dumper;

=head1 SYNOPSIS

nmica2meme [options]

 Options: 
  -skip <ID>                    skip this ID (may be repeated)
  -numseqs <numseqs>            assume frequencies based on this many
                                sequences; default: 20
  -truncate_names               truncate motif names at first underscore
  -numbers                      use numbers instead of strings as motif names
  -bg <background file>         file with background frequencies of letters;
                                default: uniform background
  -pseudo <total pseudocounts>  add <total pseudocounts> times letter
                                background to each frequency; default: 0
  -logodds                      print log-odds matrix, too;
                                default: print frequency matrix only
  -url <website>                website for the motif; The untruncated ID is
                                substituted for MOTIF_NAME; default: no url
  -h                            print usage message

  Read a nestedMICA (BioTiffin/XMS) matrix file and convert to MEME format.

  Reads standard input.
  Writes standard output

=cut

# Set option defaults
my %skips;
my $num_seqs = 20;
my $truncate_names = 0;
my $use_numbers = 0;
my $bg_file;
my $pseudo_total = 0; # default total pseudocounts
my $print_logodds = 0;
my $url_pattern = '';
my $help;
# XMS base names
my @bases = ('adenine', 'cytosine', 'guanine', 'thymine');

GetOptions("skip=s" => sub {$skips{$_[1]} = 1}, "numseqs=i" => \$num_seqs,
  "truncate_names" => \$truncate_names, "numbers" => \$use_numbers, 
  "bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds,
  "url=s" => \$url_pattern, "h" => \$help) or pod2usage(2);
#check num seqs
pod2usage("Option -numseqs must have a positive value.") if ($num_seqs < 0);
#check pseudo total
pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0);

# output help if requested
pod2usage(1) if $help;

# Get the background model.
my %bg = &read_background_file(1, $bg_file); #DNA background file

# Read the input file.
my $xml = new XML::Simple;
my $data = $xml->XMLin('-');		# read from standard input

#print Dumper($data);

my $n = @bases;
my $num_motifs = 0;
foreach my $motif_name (sort keys %{$data->{motif}}) {
  my $weight_matrix = $data->{motif}->{$motif_name}->{weightmatrix};
  my $w = $weight_matrix->{columns};
  my $alt = "";
  my $matrix = '';
  # loop over columns of motif
  foreach my $column (@{$weight_matrix->{column}}) {
   # get the frequencies for this column
    my %freqs;
    foreach my $entry (@{$column->{weight}}) {
      $freqs{$entry->{symbol}} = $entry->{content};
    }
    # create an ascii matrix
    for (my $i=0; $i<$n; $i++) {
      my $base = $bases[$i];
      $matrix .= " " . $freqs{$base}; 
    }
    $matrix .= "\n";
  }
  # convert and print ascii matrix
  my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'row', $num_seqs, $pseudo_total, id => $motif_name, alt => $alt, url => '');
  print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif;
  print(STDERR join("\n", @{$errors}), "\n") if @{$errors};
}

my $num_skipped = 0;

print(STDERR "Converted $num_motifs motifs.\n");
print(STDERR "Skipped $num_skipped motifs.\n");
