#!/usr/bin/perl =head1 NAME UniFrac_to_expanded.pl =head1 AUTHOR(S) Brett Trost =head1 SYNOPSIS UniFrac_to_expanded.pl tree_file abundance_file =head1 DESCRIPTION This script takes as input a Newick-formatted tree file, as well as a file containing the abundance of each OTU for each sample, and outputs a new tree file and abundance file suitable for use with weighted Unifrac. In the new tree, each instance of an OTU is labeled separately. For example, if there are 3 instances of OTU X in sample A, and 2 instances in sample B, then in the tree, "X" will be replaced by "(X_1:0,X_2:0,X_3:0,X_4:0,X_5:0)". The corresponding entries in the new abundance file will look like this: X_1 A 1 X_2 A 1 X_3 A 1 X_4 B 1 X_5 B 1 The new tree file will have the same name as the input tree file, except with ".expanded" on the end. The new abundance file will have the same name as the input abundance file, except with ".expanded" on the end. =head1 INPUT tree_file: a file representing a phylogenetic tree, in Newick format. For example: (A:0.1,B:0.1); abundance_file: a file containing abundance information for each OTU and sample. For example: A Sample1 4 A Sample2 0 B Sample1 0 B Sample2 4 =head1 OUTPUT Given the above input, two output files would be created. The tree file would look like this: ((A_1:0,A_2:0,A_3:0,A_4:0):0.1,(B_1:0,B_2:0,B_3:0,B_4:0):0.1); The abundance file would look like this: A_1 Sample1 1 A_2 Sample1 1 A_3 Sample1 1 A_4 Sample1 1 B_1 Sample2 1 B_2 Sample2 1 B_3 Sample2 1 B_4 Sample2 1 =head1 ASSUMPTIONS Assumes that the OTU names do not contain underscores. =head1 DATA tree_file: /birl/MAVEN/data/Num_Samples_Test/Sub_1000/QIIME_OUT/rep_set_n5.tre abundance_file: /birl/MAVEN/data/Num_Samples_Test/Sub_1000/QIIME_OUT/unifrac_table.txt =head1 EXAMPLE UniFrac_to_expanded.pl /birl/MAVEN/data/Num_Samples_Test/Sub_1000/QIIME_OUT/rep_set_n5.tre /birl/MAVEN/data/Num_Samples_Test/Sub_1000/QIIME_OUT/unifrac_table.txt =head1 SEE ALSO UniFrac_to_compact.pl =cut use strict; use warnings; my $tree_file = $ARGV[0]; my $abundances_file = $ARGV[1]; open TREE_FILE, $tree_file; chomp(my $tree_string = ); close TREE_FILE; open ABUNDANCES_FILE, $abundances_file; chomp(my @lines = ); close ABUNDANCES_FILE; my %abundances; foreach my $line (@lines) { my ($OTU, $sample, $count) = split "\t", $line; $abundances{$OTU}{$sample} = $count; } open COUNTS_OUTPUT, ">$abundances_file.expanded"; foreach my $OTU (keys %abundances) { my $OTU_string; my $total_count = 0; foreach my $sample (keys %{$abundances{$OTU}}) { for my $i (1..$abundances{$OTU}{$sample}) { $total_count++; if ($total_count == 1) { $OTU_string = "(${OTU}_$total_count:0"; } else { $OTU_string .= ",${OTU}_$total_count:0"; } print COUNTS_OUTPUT "${OTU}_$total_count\t$sample\t1\n"; } } $OTU_string .= ")"; $tree_string =~ s/([(,])\s*$OTU:/$1$OTU_string:/g; } close COUNTS_OUTPUT; open TREE_OUTPUT, ">$tree_file.expanded"; print TREE_OUTPUT "$tree_string\n"; close TREE_OUTPUT;