#!/usr/bin/perl

=head1 NAME

UniFrac_to_expanded.pl

=head1 AUTHOR(S)

Brett Trost

=head1 SYNOPSIS

UniFrac_to_expanded.pl tree_file abundance_file

=head1 DESCRIPTION

This script takes as input a Newick-formatted tree file, as well as a file containing
the abundance of each OTU for each sample, and outputs a new tree file and abundance file
suitable for use with weighted Unifrac.  In the new tree, each instance of an OTU is labeled separately.
For example, if there are 3 instances of OTU X in sample A, and 2 instances in sample B, then in the tree, 
"X" will be replaced by "(X_1:0,X_2:0,X_3:0,X_4:0,X_5:0)".  The corresponding entries in the new abundance file will
look like this:

X_1	A	1
X_2	A	1
X_3	A	1
X_4	B	1
X_5	B	1

The new tree file will have the same name as the input tree file, except with ".expanded" on the end.
The new abundance file will have the same name as the input abundance file, except with ".expanded" on the end.

=head1 INPUT

tree_file: a file representing a phylogenetic tree, in Newick format.  For example:

(A:0.1,B:0.1);

abundance_file: a file containing abundance information for each OTU and sample.  For example:

A	Sample1	4	
A	Sample2	0	
B	Sample1	0	
B	Sample2	4

=head1 OUTPUT

Given the above input, two output files would be created.  The tree file would look like this:

((A_1:0,A_2:0,A_3:0,A_4:0):0.1,(B_1:0,B_2:0,B_3:0,B_4:0):0.1);

The abundance file would look like this:

A_1	Sample1	1
A_2	Sample1	1
A_3	Sample1	1
A_4	Sample1	1
B_1	Sample2	1
B_2	Sample2	1
B_3	Sample2	1
B_4	Sample2	1

=head1 ASSUMPTIONS

Assumes that the OTU names do not contain underscores.

=head1 DATA

tree_file: /birl/MAVEN/data/Num_Samples_Test/Sub_1000/QIIME_OUT/rep_set_n5.tre
abundance_file: /birl/MAVEN/data/Num_Samples_Test/Sub_1000/QIIME_OUT/unifrac_table.txt

=head1 EXAMPLE

UniFrac_to_expanded.pl /birl/MAVEN/data/Num_Samples_Test/Sub_1000/QIIME_OUT/rep_set_n5.tre /birl/MAVEN/data/Num_Samples_Test/Sub_1000/QIIME_OUT/unifrac_table.txt

=head1 SEE ALSO

UniFrac_to_compact.pl

=cut

use strict;
use warnings;

my $tree_file = $ARGV[0];
my $abundances_file = $ARGV[1];

open TREE_FILE, $tree_file;
chomp(my $tree_string = <TREE_FILE>);
close TREE_FILE;

open ABUNDANCES_FILE, $abundances_file;
chomp(my @lines = <ABUNDANCES_FILE>);
close ABUNDANCES_FILE;


my %abundances;

foreach my $line (@lines) {
	my ($OTU, $sample, $count) = split "\t", $line;
	$abundances{$OTU}{$sample} = $count;
}


open COUNTS_OUTPUT, ">$abundances_file.expanded";

foreach my $OTU (keys %abundances) {
	my $OTU_string;
	
	my $total_count = 0;
	
	foreach my $sample (keys %{$abundances{$OTU}}) {
		for my $i (1..$abundances{$OTU}{$sample}) {
			$total_count++;
			
			if ($total_count == 1) {
				$OTU_string = "(${OTU}_$total_count:0";
			} else {
				$OTU_string .= ",${OTU}_$total_count:0";
			}
			
			print COUNTS_OUTPUT "${OTU}_$total_count\t$sample\t1\n";
			
		}
	}
	
	$OTU_string .= ")";
	
	$tree_string =~ s/([(,])\s*$OTU:/$1$OTU_string:/g;
}


close COUNTS_OUTPUT;

open TREE_OUTPUT, ">$tree_file.expanded";
print TREE_OUTPUT "$tree_string\n";
close TREE_OUTPUT;