#!/usr/bin/perl

#Only for dipeptide composition

use strict;
use Switch;
use Tie::IxHash;
use Getopt::Std;

my $argcnt = $#ARGV + 1;
if ($argcnt < 2 ){
	usage();
	exit;
}

my $FILE =$ARGV[0];
my $MODE= $ARGV[1];
my $OUT = $ARGV[2];
my $NUM=$ARGV[3];



if ($MODE eq "dpcomp") {
	write_dpcomp($FILE);
}
else { 
	print "Mode $MODE doesn't exist. Please try with a valid mode!\n";
}



exit;


################
# SUB ROUTINES #
################



###############USAGE###################################################
sub usage {
     my $program = `basename $0`;
     chomp($program);
     print STDERR "
     USAGE:
     
     $program  [ sequence file ]  [ coding mode] 

     Codes the protein sequence using the selected coding mode

     -Input sequence file
     -Coding mode [dbcomp]
           
           

####################################################################### 

";
}

##### WRITE TESTING FILES ####################

sub write_dpcomp {
        my ( $file ) = @_;
        my ( @array) = (); my @coded_pep = ();my @coded_seqs;
        my $coded_seq;
        open(OUT, ">test_dpcomp.arff") || die;
		print OUT "\@relation dpcomp\n";
		print OUT "\n";
		for my $i ( 1 .. 400 ){
		print OUT "\@attribute pos_$i real\n";
		}
		print  OUT "\@attribute class \{Y,N\}\n";
    	print  OUT "\@data\n\n";
        open(F, "$file") || die;
        while( <F> ){
                @array = split(/\s+/,$_);
                my $lg=length($array[0]);
                my $nter = substr($array[0],0,3);
                my $cter = substr($array[0],$lg-1,1);
 
				my $pepstr= $array[0];
                ( @coded_pep ) = dp_composition($pepstr);
                
                $coded_seq = join "," , @coded_pep;

                $coded_seq = $coded_seq.",?\n";
                print OUT $coded_seq;
                @array = @coded_pep = (); $coded_seq = $pepstr =  $nter = $cter = "";
        }
        close(F);
        close(OUT);
        
}


############### DP COMPOSITION #################################
sub dp_composition {
        my ($seq) = @_;
        my $lseq = length($seq);
        my ($i, $j, $k, $dpf, $l, $c, $dp, $dpn, $aa_aa) = ();
        my @dp = ();
        my @dp_comp = ();
        my @aa_cod= qw(A C D E F G H I K L M N P Q R S T V W Y);
    ####### get all dp
    for($i=0; $i < $lseq-1;$i++){
        $dp = substr($seq, $i, 2);
        ######## $dp in array #######
        $dp[$i]=$dp;
    }
        $dpn = scalar @dp;
        for $j ( 0 .. $#aa_cod) {
                for $k ( 0 .. $#aa_cod) {
                        $aa_aa = $aa_cod[$j].$aa_cod[$k];
                        $c = 0;$dpf = 0;
                        for($l = 0; $l < @dp; $l++){
                        if ($aa_aa eq $dp[$l] ){
                                $c++;
                        }
                }
                
              if ($dpn eq 0){
                	$dpf=0;
                } else {
                        $dpf = $c/$dpn;
                }
                        $dpf = sprintf"%.4f", $dpf;             
                        push @dp_comp, $dpf;
                }
    }
        $c = "";
        return (@dp_comp);
}

