#! /usr/bin/perl -w

use strict;
use Data::Dumper;
use Bio::Tools::CodonTable;
use FindBin qw($Bin);  #$Bin is the directory with the script
use lib "$Bin";        #add bin to the library path
use shared;

my $USAGE = "$0 mrna.fa genome_table min_coverage output_basename";
die $USAGE unless @ARGV==4;

my($in_fa, $in_table, $min_coverage, $out_base)=@ARGV;

#read table of genetic codes
my $species2code={};
my $in;
open $in, "<", $in_table or die;
while(my $line = <$in>) {
    chomp $line;
    my @parts = split "\t", $line;
    die unless @parts>5;
    if($line =~/^\#/) {
	die unless $parts[5] eq "Genetic code";
	next;
    }
    next if($parts[0] eq "");
    die unless $parts[5]eq "3" || $parts[5] eq "4";
    #die if exists $species2code->{lc(substr $parts[0], 0, 5)};
    $species2code->{lc(substr $parts[0], 0, 5)} = $parts[5];
}
close $in or die;

#read fasta and write it out translated
#keep sequences in memory
my $out;
my $prot_fa = $out_base . ".prot_fa";
my $species2dna={};
my $species2name={};
my $species_list = [];
open $in, "<", $in_fa or die;
open $out, ">", $prot_fa or die;
while(1) {
    my ($name, $seq) = read_fasta($in);
    last unless defined $name;
    my $species = get_species($name, $species2code);
    die "Uknown species $species" unless exists $species2code->{$species};
    my $prot = translate($$seq, $species2code->{$species});
    write_fasta($out, $name, \$prot);
    $species2dna->{$species} = $$seq;
    $species2name->{$species} = $name;
    push @$species_list, $species;
}
close $in or die;
close $out or die;

#run muscle on proteins 
my $prot_aln_tmp = $out_base . ".prot_aln_tmp";
my_run("muscle -in $prot_fa -out $prot_aln_tmp");

#read alignment to a hash
my $species2protaln={};
open $in, "<", $prot_aln_tmp or die;
while(1) {
    my ($name, $seq) = read_fasta($in);
    last unless defined $name;
    my $species = get_species($name, $species2code);
    $species2protaln->{$species} = $$seq;
}
close $in or die;

#find which columns have sufficient coverage
my $counts = [];
foreach my $seq (values %$species2protaln) {
    if(scalar(@$counts)==0) {
	@$counts = ((0) x length($seq));
    }
    die unless scalar(@$counts)==length($seq);
    for(my $pos=0; $pos<length($seq); $pos++) {
	if(substr($seq, $pos, 1) ne '-') {
	    $counts->[$pos]++;
	}
    }
}
my $good_col = [];
my $good_col_dna = [];
my $num_seq = scalar keys %$species2protaln;
for(my $i=0; $i<@$counts; $i++) {
    if($counts->[$i] >= $num_seq * $min_coverage) {
	push @$good_col, 1;
	push @$good_col_dna, 1, 1, 1;
    }
    else {
	push @$good_col, 0;
	push @$good_col_dna, 0, 0, 0;
    }
}

#write only selected columns of alignment
my $prot_aln = $out_base . ".prot_aln";
write_alignment($species2protaln, $species2name, $species_list, $good_col, 
		$prot_aln);
#convert alignment to phylip for a good measure
my $prot_phy = $out_base . ".prot_phy";
my_run("readseq -f12 $prot_aln -o=$prot_phy -a");


#create translated alignment and wobble bases
my $species2dnaaln={};
my $species2wobblealn={};
foreach my $species (keys %$species2protaln) {
    my $protaln = $species2protaln->{$species};
    my $dna = $species2dna->{$species};
    my $dnapos=0;
    my $dnaaln = "";
    my $wobblealn = "";
    for(my $pos=0; $pos<length($protaln); $pos++) {
	if(substr($protaln, $pos, 1) eq '-') {
	    $dnaaln .= "---";
	    $wobblealn .= "-";
	}
	else {
	    $dnaaln .= substr($dna, $dnapos, 3);
	    $wobblealn .= substr($dna, $dnapos+2, 1);
	    $dnapos += 3;
	}
    }
    die unless length($dna)==$dnapos;
    $species2dnaaln->{$species} = $dnaaln;
    $species2wobblealn->{$species} = $wobblealn;
}

#write dna alignment and convert to phylip
my $dna_aln = $out_base . ".dna_aln";
write_alignment($species2dnaaln, $species2name, $species_list, $good_col_dna, 
		$dna_aln);
my $dna_phy = $out_base . ".dna_phy";
my_run("readseq -f12 $dna_aln -o=$dna_phy -a");


#write wobble alignment and convert to phylip
my $wobble_aln = $out_base . ".wobble_aln";
write_alignment($species2wobblealn, $species2name, $species_list, $good_col, 
		$wobble_aln);
my $wobble_phy = $out_base . ".wobble_phy";
my_run("readseq -f12 $wobble_aln -o=$wobble_phy -a");


exit 0;

##############
sub write_alignment {
    my ($species2aln, $species2name, $species_list, $good_col, $filename) = @_;

    my $out;
    open $out, ">", $filename or die;
    foreach my $species (@$species_list) {
	my $orig = $species2aln->{$species};
	my $new = "";
	die unless length($orig)==@$good_col;
	for(my $pos=0; $pos<length($orig); $pos++) {
	    if($good_col->[$pos]) {
		$new .= substr($orig, $pos, 1);
	    }
	}
	write_fasta($out, $species2name->{$species}, \$new);
    }
    close $out or die;
}


###############
sub get_species {
    my ($name, $species2code) = @_;
    die "Bad name format $name" unless $name=~/^>([^_]+)_/;
    my $species = lc $1;
    die "Uknown species $species" unless exists $species2code->{$species};
    return $species;
}

###############
sub translate
{
    my ($seq, $code) = @_;
    my $CodonTable = Bio::Tools::CodonTable->new( -id => $code);
    my $result = $CodonTable->translate($seq);

    return $result;
}
