The Emili Lab Software

#!/usr/bin/perl

##########################################################################
# proteogest.pl
# This program analyses a given proteome and reports the analysis in
# files created on users computer.
#
# The input this program is a valid proteome FASTA file in text format.
# There are few output files produced by this program.
#
# The following are the commandline arguments to this program
# ------------------------------------------------------------------------
# [-i input_file_name]
# this has to be a valid FASTA format file specifying the protins
# in the proteome
# ------------------------------------------------------------------------
# [-d ] creates simple.txt output file
# If -d is spcified on the commandline then a simple file will be
# created. This file has four columns. 1st column is the range
# where the peptide is in the protein. 2nd and 3rd columns are the
# isotopic mass and average mass respectively for each peptide in
# the protein.
# ------------------------------------------------------------------------
# [-a ] creates annotated outputfile
#
# [-c cleavage_citeria(,cleavage_criteria...)]
# [-s ] creates summary file
# [-R ] custom modification residue value
# [-W ] custom modification weight differential
# [-I ] ICAT modification on Cys
# [-M ] MCAT modification on Arg Lys
# [-L ] minimum peptide length
# [-S ] Phosphorylation on Ser (S)
# [-T ] Phosphorylation on Thr (T)
# [-Y ] Phosphorylation on Tyr (Y)
# [-C ] peptide modificaiton:complete
# [-N ] peptide modificaiton:incomplete
# [-G ] number of missed cleavages
# [-P ] include only modified peptides in output
# [-H ] display redundant peptides
# [-h ] help
#
# NOTICE: Miscleavage option is not fully functional. Please read documentation. #
# Authors: Thanuja Premawardena and Shiva Amiri
# Emili Proteomics Lab, University of Toronto
#
##########################################################################

use Term::ANSIColor;
# a constant the mass of water
use constant WATER => 18;

use strict;
srand;

# isotopic masses of 20 amino acids
my $mass_A = 71.03711; my $mass_C = 103.00919; my $mass_D = 115.02694; my $mass_E = 129.04259; my $mass_F = 147.06841;
my $mass_H = 137.05891; my $mass_I = 113.08406; my $mass_K = 128.09496; my $mass_L = 113.08406; my $mass_M = 131.04049;
my $mass_N = 114.04293; my $mass_P = 97.05276; my $mass_Q = 128.05858; my $mass_R = 156.10111; my $mass_S = 87.03203;
my $mass_G = 57.02146; my $mass_T = 101.04768; my $mass_V = 99.06841; my $mass_W= 186.07931; my $mass_Y = 163.06333;

# average masses of 20 amino acids
my $amass_A = 71.0788; my $amass_C = 103.1388; my $amass_D = 115.0866; my $amass_E = 129.1155; my $amass_F = 147.1766;
my $amass_H = 137.1411; my $amass_I = 113.1594; my $amass_K = 128.1741; my $amass_L = 113.1411; my $amass_M = 131.1926;
my $amass_N = 114.1038; my $amass_P = 97.1167; my $amass_Q = 128.1307; my $amass_R = 156.1875; my $amass_S = 87.0782;
my $amass_G = 57.0519; my $amass_T = 101.1051; my $amass_Y = 163.1760; my $amass_V = 99.1326; my $amass_W= 186.2132;

# isotopic masses of 20 amino acids with the incomplete modifications
my $mass_a = 71.03711; my $mass_c = 103.00919 + 8.00; my $mass_d = 115.02694; my $mass_e = 129.04259; my $mass_f = 147.06841;
my $mass_h = 137.05891; my $mass_i = 113.08406; my $mass_k = 128.09496 +
42; my $mass_l = 113.08406; my $mass_m = 131.04049;
my $mass_n = 114.04293; my $mass_p = 97.05276; my $mass_q = 128.05858;
my $mass_r = 156.10111; my $mass_s = 87.03203 + 80.00;
my $mass_g = 57.02146; my $mass_t = 101.04768 + 80.00; my $mass_v = 99.06841;
my $mass_w= 186.07931; my $mass_y = 163.06333 + 80.00;

# average masses of 20 amino acids with the incomplete modifications
my $amass_a = 71.0788; my $amass_c = 103.1388 + 8.00; my $amass_d = 115.0866; my $amass_e = 129.1155; my $amass_f = 147.1766;
my $amass_h = 137.1411; my $amass_i = 113.1594; my $amass_k = 128.1741 +
42.00; my $amass_l = 113.1411; my $amass_m = 131.1926;
my $amass_n = 114.1038; my $amass_p = 97.1167; my $amass_q = 128.1307;
my $amass_r = 156.1875; my $amass_s = 87.0782 + 80.00;
my $amass_g = 57.0519; my $amass_t = 101.1051 + 80.00; my $amass_y = 163.1760 +
80.00; my $amass_v = 99.1326; my $amass_w= 186.2132;

# isotopic masses of 20 amino acids for calculating total protein mass
my $mass_a_P = 71.03711; my $mass_c_P = 103.00919; my $mass_d_P = 115.02694; my $mass_e_P = 129.04259; my $mass_f_P = 147.06841;
my $mass_h_P = 137.05891; my $mass_i_P = 113.08406; my $mass_k_P = 128.09496; my $mass_l_P = 113.08406; my $mass_m_P = 131.04049;
my $mass_n_P = 114.04293; my $mass_p_P = 97.05276; my $mass_q_P = 128.05858; my $mass_r_P = 156.10111; my $mass_s_P = 87.03203;
my $mass_g_P = 57.02146; my $mass_t_P = 101.04768; my $mass_v_P = 99.06841; my $mass_w_P = 186.07931; my $mass_y_P = 163.06333;

# average masses of 20 amino acids for calculating total protein mass
my $amass_a_P = 71.0788; my $amass_c_P = 103.1388; my $amass_d_P = 115.0866; my $amass_e_P = 129.1155; my $amass_f_P = 147.1766;
my $amass_h_P = 137.1411; my $amass_i_P = 113.1594; my $amass_k_P = 128.1741; my $amass_l_P = 113.1411; my $amass_m_P = 131.1926;
my $amass_n_P = 114.1038; my $amass_p_P = 97.1167; my $amass_q_P = 128.1307; my $amass_r_P = 156.1875; my $amass_s_P = 87.0782;
my $amass_g_P = 57.0519; my $amass_t_P = 101.1051; my $amass_y_P = 163.1760; my $amass_v_P = 99.1326; my $amass_w_P = 186.2132;

# the number of each amino acid in each peptide
my $ACount = 0; my $CCount = 0; my $DCount = 0; my $ECount = 0; my $FCount = 0;
my $GCount = 0; my $HCount = 0; my $ICount = 0; my $KCount = 0; my $LCount = 0;
my $MCount = 0; my $NCount = 0; my $PCount = 0; my $QCount = 0; my $RCount = 0;
my $SCount = 0; my $TCount = 0; my $VCount = 0; my $WCount = 0; my $YCount = 0;

# the count of each amino acid in each protein
my $totalACount = 0; my $totalCCount = 0; my $totalDCount = 0; my $totalECount = 0;
my $totalFCount = 0; my $totalGCount = 0; my $totalHCount = 0; my $totalICount = 0;
my $totalKCount = 0; my $totalLCount = 0; my $totalMCount = 0; my $totalNCount = 0;
my $totalPCount = 0; my $totalQCount = 0; my $totalRCount = 0; my $totalSCount = 0;
my $totalTCount = 0; my $totalVCount = 0; my $totalWCount = 0; my $totalYCount = 0;

# the count of each amino acid in each protein
my $Aexists = 0; my $Cexists = 0; my $Dexists = 0; my $Eexists = 0; my $Fexists = 0;
my $Gexists = 0; my $Hexists = 0; my $Iexists = 0; my $Kexists = 0; my $Lexists = 0;
my $Mexists = 0; my $Nexists = 0; my $Pexists = 0; my $Qexists = 0; my $Rexists = 0;
my $Sexists = 0; my $Texists = 0; my $Vexists = 0; my $Wexists = 0; my $Yexists = 0;

my $A; my $C; my $D; my $E; my $F; my $G; my $H; my $I; my $K; my $L; my $M;
my $N; my $P; my $Q; my $R; my $S; my $T; my $V; my $W; my $Y;
my $nbr; my $oldpeptide;

# if $opt_R is defined use these variable for average mass and isotopic mass
# isotopic masses of 20 amino acids with the incomplete modifications
my $mass_aZ = 71.03711; my $mass_cZ = 103.00919; my $mass_dZ = 115.02694; my $mass_eZ = 129.04259; my $mass_fZ = 147.06841;
my $mass_hZ = 137.05891; my $mass_iZ = 113.08406; my $mass_kZ = 128.09496; my $mass_lZ = 113.08406; my $mass_mZ = 131.04049;
my $mass_nZ = 114.04293; my $mass_pZ = 97.05276; my $mass_qZ = 128.05858; my $mass_rZ = 156.10111; my $mass_sZ = 87.03203;
my $mass_gZ = 57.02146; my $mass_tZ = 101.04768; my $mass_vZ = 99.06841; my $mass_wZ= 186.07931; my $mass_yZ = 163.06333;

# average masses of 20 amino acids with the incomplete modifications
my $amass_aZ = 71.0788; my $amass_cZ = 103.1388; my $amass_dZ = 115.0866; my $amass_eZ = 129.1155; my $amass_fZ = 147.1766;
my $amass_hZ = 137.1411; my $amass_iZ = 113.1594; my $amass_kZ = 128.1741; my $amass_lZ = 113.1411; my $amass_mZ = 131.1926;
my $amass_nZ = 114.1038; my $amass_pZ = 97.1167; my $amass_qZ = 128.1307; my $amass_rZ = 156.1875; my $amass_sZ = 87.0782;
my $amass_gZ = 57.0519; my $amass_tZ = 101.1051; my $amass_yZ = 163.1760; my $amass_vZ = 99.1326 ; my $amass_wZ = 186.2132;

# the number of acidic and basic residues in the proteome
my $acidic_count = 0;
my $basic_count = 0;
my $polar_count = 0;
my $nonpolar_count = 0;

my $prot_acid_count = 0;
my $prot_basic_count = 0;
my $prot_polar_count = 0;
my $prot_nonpolar_count = 0;
my $prot_iso_mass = 0;
my $prot_ave_mass = 0;

# this hash holds the redundancy for each peptide in the protein
my %redun_hash = ();
my @counter_array;
my $seq;
my $num = 0;

#the missed cleavages by default
my $missed_cleavage =0;
my $peptide_length = 2;

# variables used to edit cleavage

my $totalavemass=0; # initialization of masses
my $totalisomass = 0; # initialization of masses
my $totalisoEpoint = 0;

my $protein_name; # vairable that stores the name of the protein
my $protein; # each protein in the file
my $protein1; # the protein being separated into peptides
my $numpep =0; # initialization of number of peptides in each protein
my $peptide;
my @breakAt; # stores the cleavages
my @breakAt2;
my @first_nums;
my @second_nums;
my @positions;
my @joined;
my $key;
my $input_file; # name of the input fasta file
my $range;
my %peptide_hash; # keys are the range (position) of the peptide in the file

my @amino_acid_protein; # this hash keeps track of counts of amino acids per protein
# this is a multi dimensional array
my @aminoacids_count_protein;

# this has keeps track of counts of amino acids per peptide this is a multi
# dimensional array. The zeroth column of this array stores all 20 amino
# acids. Every row also stores the MAX amount of columns to keep a count of
# proteins that has peptides with 0..MAX-1 amount of given amino acid
my @amino_acid_peptide;
# user can change the MAX to any number then like
use constant MAX => 20;

my @aminoacids_count_peptide;

my $totalPeptides; # this variable stores the total number of peptides in the proteome
my $totalProteins; # this variable stores the total number of proteins in the proteome

my @isoMass_protein; # array to store isoMass of each protein
my @avgMass_protein; # array to store the avgMass of each protein
my @len_protein; # array to store the lengths of each protein

my @peptides_protein; # keeps track of peptide length for each protein

my @AA_perProtein; # keeps all the amino acid per protein

my @isoMass_peptide; # array to store isoMass of each peptide
my @avgMass_peptide; # array to store the avgMass of each peptide
my @len_peptide; # array to store the lengths of each peptide
my @charge_peptide; # array to store the charge of every peptide

my @hydrophobicity_peptide;#holds the hydrophobicity values for each peptide

# this is for proteins
my $totalIsoMass; # keeps the total IsoMass
my $totalAvgMass; # keeps the total AvgMass
my $totalLen;
my $totalPepds; # stores the peptides for each protein
my $totalHDPcity; # stores the hydrophobicity for each pepetide

my @amino_acids; # stores teh amino acids that are specified in residue

my $user_input_commandline=join(@ARGV," "); # the commandline arugment typed by the user

my $k; my $j; my $f; my $s;
$totalPeptides = 0;
$totalProteins = 0;

# this is for proteins
$totalIsoMass = 0;
$totalAvgMass = 0;
$totalLen = 0;
$totalPepds = 0;

my $display_redun=0;

use Getopt::Std;
# options to run the program
use vars qw($opt_i $opt_d $opt_a $opt_c $opt_s $opt_S $opt_T $opt_Y $opt_R $opt_W $opt_I $opt_M
$opt_C $opt_N $opt_G $opt_L $opt_H $opt_h );

$opt_i=""; # input file name
#$opt_d = 1; # simple output file
#$opt_a = 1; # annotated output file
$opt_s = 1; # summary file
# $opt_c, # cleavage criteria(AA's seperated by commas)
# $opt_S,# Phosphorylation on Ser (S)
# $opt_T,# Phosphorylation on Thr (T)
# $opt_Y,# Phosphorylation on Tyr (Y)
# $opt_R, # custom modification residue
# $opt_W, # custom modification weight differential
# $opt_I, # ICAT modification on Cys
# $opt_M, # MCAT modification on Arg Lys
# $opt_C,# peptide modificaiton:complete
# $opt_N,# peptide modificaiton:incomplete

# $opt_G,# number of missed cleavages
# $opt_L, # minimum peptide length

# $opt_P,# include only modified peptides in output
# $opt_H,# display redundant peptides

$opt_h=0; # help
# get the processing options

#***********************USER DEFINED VALUES ************************

# specifies the option user enters in.
if ( ! getopts('i:dasc:STYR:W:IMCNG:L:Hh')) {
error_msg();
exit;
}
$user_input_commandline= $Getopt::Std;

print "$user_input_commandline\n";

checkArgs();
sub error_msg{
print STDERR "USAGE: $0 \n",
"\t[-i input_file_name-protein sequence in fasta format]\n",
"\t[-d ] creates simple.txt output file\n",
"\t[-a ] creates annotated outputfile\n",
"\t[-c cleavage_citeria(,cleavage_criteria...)]\n",
"\t[-s ] creates summary file\n",
"\t[-R custom modification residue value]\n",
"\t[-W custom modification weight differential]\n",
"\t[-I ] ICAT modification on Cys\n",
"\t[-M ] MCAT modification on Arg Lys\n",
"\t[-L minimum peptide length]\n",
"\t[-S ] Phosphorylation on Ser (S)\n",
"\t[-T ] Phosphorylation on Thr (T)\n",
"\t[-Y ] Phosphorylation on Tyr (Y)\n",
"\t[-C ] peptide modificaiton:complete\n",
"\t[-N ] peptide modificaiton:incomplete\n",
"\t[-G number of missed cleavages]\n",
"\t[-H ] display redundant peptides\n",
"\t[-h ] help\n\n";
}

# take care of the commandline arguments
sub checkArgs{
# **********$opt_h is the help option*********************#
if($opt_h ==1){
print "\n\nProtein Cleavage and Classification Options\n";
print "===========================================\n\n";
print "Options:\n";
print " [-i input_file_name-protein sequence in fasta format]]\n";
print " [-d ] creates simple.txt output file\n";
print " [-a ] creates annotated outputfile\n";
print " [-c cleavage_citeria(,cleavage_criteria...)]\n";
print " [-s ] creates summary file\n";
print " [-R custom modification residue value]\n";
print " [-W custom modification weight differential]\n";
print " [-I ] ICAT modification on Cys\n";
print " [-M ] MCAT modification on Arg Lys\n";
print " [-L minimum peptide length]\n";
print " [-S ] Phosphorylation on Ser (S)\n";
print " [-T ] Phosphorylation on Thr (T)\n";
print " [-Y ] Phosphorylation on Tyr (Y)\n";
print " [-C ] peptide modificaiton:complete\n";
print " [-N ] peptide modificaiton:incomplete\n";
print " [-G number of missed cleavages]\n";
print " [-H ] display redundant peptides\n";
print " [-h ] help\n";
exit;
}
# *******opt_i input file name ********
if ($opt_i=~ /^$/){ # user must specify a valid input file name
error_msg();
exit;
}# check whether input file exists, readable and it is a non empty file
elsif(-e $opt_i && -r $opt_i && -s $opt_i){
my $c = 0;
open(INPUT_FASTA, "<$opt_i") || die " Cannot open file $opt_i\n";
open(FASTA, "<$opt_i") || die " Cannot open file $opt_i\n";
# check whether this file is fasta formated
while(<FASTA>){
if($_ =~ /^>/ && $c ==0){
close(FASTA);

last;
}
else{
print "Not a valid FASTA file. Please re-run the program with a valid FASTA file\n";
close(FASTA);
exit;
}
} #end while

$input_file = $opt_i;
# takes away all the letters followed by the . for the appending for the outputfile
$opt_i =~ s/(\..*)$//;
} #end else if
else{
print "$opt_i doesn't exist,no read access or no data in the file\n";
exit(0);
}

# *******opt_c cleavage criteria ********
if ($opt_c =~ /^$/){ # user must specify a valid cleavage criteria
print STDERR "You must enter a valid cleavage criteria\n";
error_msg();
exit;
}

my $cleavage = $opt_c;

my $builder = "";
my $builder2 = "";
my $builder3 = "";
my $tryp_flag1 = 0;
my $tryp_flag2 = 0;
my @AAs = ("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y");
$cleavage =~ tr/a-z/A-Z/;
if ($cleavage =~ /[^A-Z,]/) {
print STDERR "You must enter a valid cleavage criteria\n";
error_msg();
exit;
}

elsif ($cleavage =~ /[A-Z,]/){
my $temp_cleave = $cleavage;
my @splices = split(/,/,$temp_cleave);
for (my $i=0; $i<=$#splices; $i++) {
if ($splices[$i]=~/Z/) {
for ($j=0; $j<=$#AAs; $j++) {
my $wildcard = $splices[$i];
$wildcard =~ s/Z/$AAs[$j]/;
$wildcard = $wildcard.",";
$builder = $builder.$wildcard;
}
}
if ($splices[$i] =~ /TRYPSIN/) {
splice(@splices, $i, 1);
$tryp_flag1 = 1;

}
}

if ($tryp_flag1 == 1) {
$builder2 =
"KXA,KXC,KXD,KXE,KXF,KXG,KXH,KXI,KXK,KXL,KXM,KXN,KXQ,KXR,KXS,KXT,KXV,KXY,KXW,RXA,RXC,RXD,RXE,RXF,RXG,RXH,RXI,RXK,RXL,RXM,RXN,RXQ,RXR,RXS,RXT,RXV,RXY,RXW";
for (my $k = 0; $k <= $#splices; $k++) {
$builder3 = $builder3.$splices[$k].",";

}
$cleavage = $builder3.$builder2;
}
else {
$cleavage = $cleavage.",";
$cleavage = $cleavage.$builder;
}
if($cleavage=~/^,/){ # removes the leading commas from cleavage
$cleavage =~ s/^,+//;
}
if($cleavage=~/,$/){ # removes the trailing commas from cleavage
$cleavage =~ s/\,+$//;
}
@breakAt = split(/,/,$cleavage);

for ($k=0; $k<=$#breakAt; $k++) {
if ($breakAt[$k] =~ /Z/){
splice(@breakAt, $k, 1);
$k = $k-1;
}
}

for (my $a=0; $a<=$#breakAt; $a++) {
push(@breakAt2, $breakAt[$a]);
}

for my $j (0 .. $#breakAt){
$breakAt[$j] =~ s/X//g;
}
}

my $simple_file_name;
my $annotated_file_name;
my $summary_file_name;
# *******opt_d creates simple.txt output file ********
if ($opt_d==1){
# name of the simple.txt output file
$simple_file_name = "$opt_i"."_simple.txt";
open (SIMPLE, ">$simple_file_name")|| die "Cannot open $simple_file_name $!";

}

# *******opt_a creates annotated outputfile ********
if ($opt_a==1){
# name of the annotated.txt output file
$annotated_file_name = "$opt_i"."_annotated.txt";
open (ANNOTATED, ">$annotated_file_name")|| die "Cannot open $annotated_file_name $!";
}

# *******opt_s creates summary file ********
if ($opt_s==1){ # if user specify for the summary then the summary will be written
# name of the summary output file
$summary_file_name = "summary.html";
#$summary_file_name = "/home/thanuja/public_html/proteogest/summary.html";
open (SUMMARY, ">$summary_file_name")|| die "Cannot open $summary_file_name $!";
}
# *******$opt_L minimum peptide length. default is 2********
# user specifies the miminum length of peptides to include in the
# analysis
if (defined($opt_L) && $opt_L>0){
$peptide_length = $opt_L;
}

# *******$opt_G number of missed cleavages default is 0********
if (defined($opt_G) && $opt_G >0){
$missed_cleavage = $opt_G;
}

# *******$opt_S Phosphorylation on Ser (S)********
if($opt_S ==1 && $opt_N==0){
# add 80 to Ser
$mass_S = $mass_S + 80.00;
$amass_S = $amass_S + 80.00;
}

# *******$opt_T Phosphorylation on Thr (T)********
if($opt_T ==1 && $opt_N==0){
# add 80 to Thr
$mass_T = $mass_T + 80.00;
$amass_T = $amass_T + 80.00;
}

# *******$opt_Y Phosphorylation on Tyr (Y)********
if($opt_Y ==1 && $opt_N==0){
# add 80 to Tyr
$mass_Y = $mass_Y + 80.00;
$amass_Y = $amass_Y + 80.00;
}

# *******$opt_I ICAT modification on Cys (C)********
#if ($opt_I==1 && $opt_N==0){
# $mass_C = $mass_C + 8.00;
# $amass_C = $amass_C + 8.00;

# *******$opt_M MCAT modification on Arg Lys ********
if ($opt_M==1 && $opt_N==0){
$mass_K = $mass_K + 42.00;
$amass_K = $amass_K + 42.00;
}

# *******$opt_R custom modification residue value ********
if (defined($opt_R) && ($opt_R =~ /^$/)){ # user must specify a valid positive integer
print STDERR " Custom Modification Error: Please specify a valid Amino Acid and re-run the program\n";
exit;
}
# translate small case to upper case
$opt_R =~ tr/a-z/A-Z/;

# if $opt_R is not a letter then send an error message
if(defined($opt_R) && ($opt_R =~/[^A-Z,]/)){
print STDERR " Custom Modification Error: Please specify a valid Amino Acid and re-run the program\n";
exit;
}
# check whether it is a valid amino acid
elsif(defined($opt_R) && ($opt_R =~/B/ || $opt_R =~/J/ || $opt_R =~/O/ || $opt_R =~/U/ || $opt_R =~/X/ || $opt_R =~/Z/)){
print STDERR " Custom Modification Error: Please specify a valid Amino Acid and re-run the program\n";
exit;
}

# *******$opt_W custom modification weight differential********
if (defined($opt_W) && ($opt_R) && ($opt_W =~ /^$/ )){ # user must specify a valid positive real integer
print STDERR " Custom Modification Error: Please specify Weight differential and re-run the program\n";
exit;
}
elsif($opt_W !~ /^-?\d+\.?\d*$/ && $opt_R){
# check whether the weight is valid positive real integer
print STDERR "Custom Modification Error: Invalid weight differential: $opt_W\nPlease re-run the program\n";
exit;
}

@amino_acids = split(/,/,$opt_R);
foreach my $amino_acid (@amino_acids){
if(length($amino_acid) >1){
print STDERR " Please specify Amino Acid seperated by commas (eg: R,T,A) and re-run the program\n";
exit;
}
}
# now add the given weight to the given AA
if($opt_R && $opt_W && $opt_N!=1){
sleep 5;
foreach my $amino_acid (@amino_acids){
if($amino_acid =~ /^A$/){
$mass_A = $mass_A + $opt_W;
$amass_A = $amass_A + $opt_W;
}
if($amino_acid =~ /^C$/){
$mass_C = $mass_C + $opt_W;
$amass_C = $amass_C+ $opt_W;
}
if($amino_acid =~ /^D$/){
$mass_D = $mass_D + $opt_W;
$amass_D = $amass_D+ $opt_W;
}
if($amino_acid =~ /^E$/){
$mass_E = $mass_E + $opt_W;
$amass_E = $amass_E+ $opt_W;
}
if($amino_acid =~ /^F$/){
$mass_F = $mass_F + $opt_W;
$amass_F = $amass_F+ $opt_W;
}
if($amino_acid =~ /^G$/){
$mass_G = $mass_G + $opt_W;
$amass_G = $amass_G+ $opt_W;
}
if($amino_acid =~ /^H$/){
$mass_H = $mass_H + $opt_W;
$amass_H = $amass_H+ $opt_W;
}
if($amino_acid =~ /^I$/){
$mass_I = $mass_I + $opt_W;
$amass_I = $amass_I+ $opt_W;
}
if($amino_acid =~ /^K$/){
$mass_K = $mass_K + $opt_W;
$amass_K = $amass_K+ $opt_W;
}
if($amino_acid =~ /^L$/){
$mass_L = $mass_L + $opt_W;
$amass_L = $amass_L+ $opt_W;
}
if($amino_acid =~ /^M$/){
$mass_M = $mass_M + $opt_W;
$amass_M = $amass_M+ $opt_W;
}
if($amino_acid =~ /^N$/){
$mass_N = $mass_N + $opt_W;
$amass_N = $amass_N+ $opt_W;
}
if($amino_acid =~ /^P$/){
$mass_P = $mass_P + $opt_W;
$amass_P = $amass_P+ $opt_W;
}
if($amino_acid =~ /^Q$/){
$mass_Q = $mass_Q + $opt_W;
$amass_Q = $amass_Q+ $opt_W;
}
if($amino_acid =~ /^R$/){
$mass_R = $mass_R + $opt_W;
$amass_R = $amass_R+ $opt_W;
}
if($amino_acid =~ /^S$/){
$mass_S = $mass_S + $opt_W;
$amass_S = $amass_S+ $opt_W;
}
if($amino_acid =~ /^T$/){
$mass_T = $mass_T + $opt_W;
$amass_T = $amass_T+ $opt_W;
}
if($amino_acid =~ /^V$/){
$mass_V = $mass_V + $opt_W;
$amass_V = $amass_V+ $opt_W;
}
if($amino_acid =~ /^W$/){
$mass_W = $mass_W + $opt_W;
$amass_W = $amass_W+ $opt_W;

}
if($amino_acid =~ /^Y$/){
$mass_Y = $mass_Y + $opt_W;
$amass_Y = $amass_Y+ $opt_W;
}
} # end for
} # end if

# *******$opt_N peptide modificaiton:incomplete ********
if ($opt_N==1){
# tricky
}

# Now display to STDOUT the options selected
print "Protein Cleavage and Classification\n";
print "===================================\n\n";
print "Options:\n";
print " input FASTA file:\t $input_file\n";
print " cleavage criteria:\t $cleavage\n";
if ($opt_d==1){print " simple output file:\t $simple_file_name\n";}
if ($opt_a==1){print " annotated output file:\t$annotated_file_name\n";}
if ($opt_s==1){print " summary file: $summary_file_name\n";}
print " minimum peptide length:\t$peptide_length\n";
print " number of missed cleavages:\t$missed_cleavage\n";
if ($opt_S ==1){print " Phosphorylation on Ser (S)\n";}
if ($opt_T ==1){print " Phosphorylation on Thr (T)\n";}
if ($opt_Y ==1){print " Phosphorylation on Tyr (Y)\n";}
if ($opt_I ==1){print " ICAT modification on Cys (C)\n";}
if ($opt_M ==1){print " MCAT modification on Arg Lys\n";}
if ($opt_N ==1){print " peptide modificaiton:incomplete\n";}
if ($opt_H ==1){print " display redundant peptides\n";}
if (defined($opt_R)){print " custom modification residue value on : $opt_R\n";}
if (defined($opt_W)){print " custom modification weight differential is: $opt_W\n";}
#*******************************************************************
print "\n\n";
print "Running proteogest..\n";
print "\n\n";
} # end sub

open(PEPTIDES, ">peptides.txt") || die "Cannot open peptides.txt $!";

# formatting the output for output file1
format SIMPLE =
@####### @<<<<<<<<<< @#####.##### @#####.##### @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$numpep, $range, $totalisomass, $totalavemass, $peptide
.

# formatting the output for output file2
format ANNOTATED=
@>>>>>>> @###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@### @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$nbr, $A, $C, $D, $E, $F, $G, $H, $I, $K, $L, $M, $N, $P, $Q, $R, $S, $T, $V, $W, $Y, $oldpeptide
.

# formatting the output for standard out
format STDOUT=
@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###@###
$A, $C, $D, $E, $F, $G, $H, $I, $K, $L, $M, $N, $P, $Q, $R, $S, $T, $V, $W, $Y
.

# formatting the output for standard out
format PEPTIDES=
@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$peptide
.

# this subroutine gets called to initialize the multi-dimensional array to
# store peptide statistics analysis displayed in the summary file.
initilizePeptideStatisticsArray();

# calls the main subroutine
main();

# sub routine::comparison function
# compares the lengths of 2 arrays
sub by_length{
$f = length $a;
$s = length $b;
$s<=>$f;
}

# reinitializes the variables
sub reinitialize{
$totalIsoMass = 0;
$totalAvgMass = 0;
$totalLen = 0;
$totalPepds = 0;

undef @aminoacids_count_protein;

$totalACount = 0; $totalCCount = 0; $totalDCount = 0; $totalECount = 0;
$totalFCount = 0; $totalGCount = 0; $totalHCount = 0; $totalICount = 0;
$totalKCount = 0; $totalLCount = 0; $totalMCount = 0; $totalNCount = 0;
$totalPCount = 0; $totalQCount = 0; $totalRCount = 0; $totalSCount = 0;
$totalTCount = 0; $totalVCount = 0; $totalWCount = 0; $totalYCount = 0;

$Aexists = 0; $Cexists = 0; $Dexists = 0; $Eexists = 0; $Fexists = 0;
$Gexists = 0; $Hexists = 0; $Iexists = 0; $Kexists = 0; $Lexists = 0;
$Mexists = 0; $Nexists = 0; $Pexists = 0; $Qexists = 0; $Rexists = 0;
$Sexists = 0; $Texists = 0; $Vexists = 0; $Wexists = 0; $Yexists = 0;

$prot_acid_count = 0;
$prot_basic_count = 0;
$prot_polar_count = 0;
$prot_nonpolar_count = 0;
$prot_iso_mass = 0;
$prot_ave_mass = 0;
}

# This subroutine takes care of analyzing each peptide. This subroutine takes
# care of calculting the total isotopic and average mass of each peptide. It
# also takes care of counting amino acids present in each peptide.
sub analyze_peptide{
my ($p) = $_[0];
my ($x) = $_[1];
my ($r) = $_[2];
my $complete =$_[3];
# get the value of the pointer
$peptide = $$p;
$numpep = $$x;
$range = $$r;

# keep this peptide for the redundancy calculations for the summary
# do only if the user requests
if($opt_H==1 && $complete==1){
if(exists $redun_hash{$peptide}){
if ($peptide !~ "par"){
my @protein = @{$redun_hash{$peptide}};
push(@protein,$protein_name);
$redun_hash{$peptide} = [@protein];
undef @protein;
$display_redun =1;
}

}
else{
if (($peptide !~ "par") && ($peptide !~
/[a-z]/)) {
my @protein;
push(@protein,$protein_name);

$redun_hash{$peptide} = [@protein];
undef @protein;
}
}
}

# initializes the counts of each amino acid back to 0, for the next peptide
$ACount = 0; $CCount = 0; $DCount = 0; $ECount = 0; $FCount = 0;
$GCount = 0; $HCount = 0; $ICount = 0; $KCount = 0; $LCount = 0;
$MCount = 0; $NCount = 0; $PCount = 0; $QCount = 0; $RCount = 0;
$SCount = 0; $TCount = 0; $VCount = 0; $WCount = 0; $YCount = 0;
# calculates the average an isotopic and average mass of each peptide
for($j = 0; $j<=(length($peptide)); $j++){
if (substr($peptide, $j, 2) eq "aZ"){
$j= $j+1;
$totalavemass = $totalavemass + $amass_aZ +$opt_W;
$totalisomass = $totalisomass + $mass_aZ + $opt_W;
}
elsif (substr($peptide, $j, 2) eq "cZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_cZ +$opt_W ;
$totalisomass = $totalisomass + $mass_cZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "dZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_dZ +$opt_W;
$totalisomass = $totalisomass + $mass_dZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "eZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_eZ +$opt_W;
$totalisomass = $totalisomass + $mass_eZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "fZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_fZ +$opt_W;
$totalisomass = $totalisomass + $mass_fZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "gZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_gZ +$opt_W;
$totalisomass = $totalisomass + $mass_gZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "hZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_hZ +$opt_W;
$totalisomass = $totalisomass + $mass_hZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "iZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_iZ +$opt_W;
$totalisomass = $totalisomass + $mass_iZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "kZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_kZ +$opt_W;
$totalisomass = $totalisomass + $mass_kZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "lZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_lZ +$opt_W;
$totalisomass = $totalisomass + $mass_lZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "mZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_mZ +$opt_W;
$totalisomass = $totalisomass + $mass_mZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "nZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_nZ +$opt_W;
$totalisomass = $totalisomass + $mass_nZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "pZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_pZ +$opt_W;
$totalisomass = $totalisomass + $mass_pZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "qZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_qZ +$opt_W;
$totalisomass = $totalisomass + $mass_qZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "rZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_rZ +$opt_W;
$totalisomass = $totalisomass + $mass_rZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "sZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_sZ +$opt_W;
$totalisomass = $totalisomass + $mass_sZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "tZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_tZ +$opt_W;
$totalisomass = $totalisomass + $mass_tZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "vZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_vZ +$opt_W;
$totalisomass = $totalisomass + $mass_vZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "wZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_wZ +$opt_W;
$totalisomass = $totalisomass + $mass_wZ +$opt_W;
}
elsif (substr($peptide, $j, 2) eq "yZ"){
$j = $j+1;
$totalavemass = $totalavemass + $amass_yZ +$opt_W;
$totalisomass = $totalisomass + $mass_yZ +$opt_W;
}
elsif (substr($peptide, $j, 1) eq "A"){
$totalavemass = $totalavemass + $amass_A;
$totalisomass = $totalisomass + $mass_A;
$ACount++;
$totalACount++;
$prot_iso_mass += $mass_a_P;
$prot_ave_mass += $amass_a_P;
$aminoacids_count_protein[0]="A="."$totalACount";
$nonpolar_count++;
$prot_nonpolar_count++;
}
elsif (substr($peptide, $j, 1) eq "C"){
$totalavemass = $totalavemass + $amass_C;
$totalisomass = $totalisomass + $mass_C;
$CCount++;
$totalCCount++;
$prot_iso_mass += $mass_c_P;
$prot_ave_mass += $amass_c_P;
$aminoacids_count_protein[1]="C="."$totalCCount";
$nonpolar_count++;
#$prot_nonpolar_count++;
}
elsif (substr($peptide, $j, 1) eq "D"){
$totalavemass = $totalavemass + $amass_D;
$totalisomass = $totalisomass + $mass_D;
$DCount++;
$totalDCount++;
$acidic_count++;
$prot_iso_mass += $mass_d_P;
$prot_ave_mass += $amass_d_P;
$aminoacids_count_protein[2]="D="."$totalDCount";
$prot_acid_count++;
$polar_count++;
$prot_polar_count++;
}
elsif (substr($peptide, $j, 1) eq "E"){
$totalavemass = $totalavemass + $amass_E;
$totalisomass = $totalisomass + $mass_E;
$ECount++;
$totalECount++;
$acidic_count++;
$prot_iso_mass += $mass_e_P;
$prot_ave_mass += $amass_e_P;
$aminoacids_count_protein[3]="E="."$totalECount";
$prot_acid_count++;
$polar_count++;
$prot_polar_count++;
}
elsif (substr($peptide, $j, 1) eq "F"){
$totalavemass = $totalavemass + $amass_F;
$totalisomass = $totalisomass + $mass_F;
$FCount++;
$totalFCount++;
$prot_iso_mass += $mass_f_P;
$prot_ave_mass += $amass_f_P;
$aminoacids_count_protein[4]="F="."$totalFCount";
$nonpolar_count++;
$prot_nonpolar_count++;
}
elsif (substr($peptide, $j, 1) eq "G"){
$totalavemass = $totalavemass + $amass_G;
$totalisomass = $totalisomass + $mass_G;
$GCount++;
$totalGCount++;
$prot_iso_mass += $mass_g_P;
$prot_ave_mass += $amass_g_P;
$aminoacids_count_protein[5]="G="."$totalGCount";
$nonpolar_count++;
$prot_nonpolar_count++;
}
elsif (substr($peptide, $j, 1) eq "H"){
$totalavemass = $totalavemass + $amass_H;
$totalisomass = $totalisomass + $mass_H;
$HCount++;
$totalHCount++;
$prot_iso_mass += $mass_h_P;
$prot_ave_mass += $amass_h_P;
$aminoacids_count_protein[6]="H="."$totalHCount";
$basic_count++;
$prot_basic_count++;
$polar_count++;
$prot_polar_count++;
}
elsif (substr($peptide, $j, 1) eq "I"){
$totalavemass = $totalavemass + $amass_I;
$totalisomass = $totalisomass + $mass_I;
$ICount++;
$totalICount++;
$prot_iso_mass += $mass_i_P;
$prot_ave_mass += $amass_i_P;
$aminoacids_count_protein[7]="I="."$totalICount";
$nonpolar_count++;
$prot_nonpolar_count++;
}
elsif (substr($peptide, $j, 1) eq "K"){
$totalavemass = $totalavemass + $amass_K;
$totalisomass = $totalisomass + $mass_K;
$KCount++;
$totalKCount++;
$prot_iso_mass += $mass_k_P;
$prot_ave_mass += $amass_k_P;
$aminoacids_count_protein[8]="K="."$totalKCount";
$basic_count++;
$prot_basic_count++;
$polar_count++;
$prot_polar_count++;
}
elsif (substr($peptide, $j, 1) eq "L"){
$totalavemass = $totalavemass + $amass_L;
$totalisomass = $totalisomass + $mass_L;
$LCount++;
$totalLCount++;
$prot_iso_mass += $mass_l_P;
$prot_ave_mass += $amass_l_P;
$aminoacids_count_protein[9]="L="."$totalLCount";
$nonpolar_count++;
$prot_nonpolar_count++;
}
elsif (substr($peptide, $j, 1) eq "M"){
$totalavemass = $totalavemass + $amass_M;
$totalisomass = $totalisomass + $mass_M;
$MCount++;
$totalMCount++;
$prot_iso_mass += $mass_m_P;
$prot_ave_mass += $amass_m_P;
$aminoacids_count_protein[10]="M="."$totalMCount";
$nonpolar_count++;
$prot_nonpolar_count++;
}
elsif (substr($peptide, $j, 1) eq "N"){
$totalavemass = $totalavemass + $amass_N;
$totalisomass = $totalisomass + $mass_N;
$NCount++;
$totalNCount++;
$prot_iso_mass += $mass_n_P;
$prot_ave_mass += $amass_n_P;
$aminoacids_count_protein[11]="N="."$totalNCount";
$polar_count++;
$prot_polar_count++;
}
elsif (substr($peptide, $j, 1) eq "P"){
$totalavemass = $totalavemass + $amass_P;
$totalisomass = $totalisomass + $mass_P;
$PCount++;
$totalPCount++;
$prot_iso_mass += $mass_p_P;
$prot_ave_mass += $amass_p_P;
$aminoacids_count_protein[12]="P="."$totalPCount";
$nonpolar_count++;
$prot_nonpolar_count++;
}
elsif (substr($peptide, $j, 1) eq "Q"){
$totalavemass = $totalavemass + $amass_Q;
$totalisomass = $totalisomass + $mass_Q;
$QCount++;
$totalQCount++;
$prot_iso_mass += $mass_q_P;
$prot_ave_mass += $amass_q_P;
$aminoacids_count_protein[13]="Q="."$totalQCount";
$polar_count++;
$prot_polar_count++;
}

elsif (substr($peptide, $j, 1) eq "R"){
$totalavemass = $totalavemass + $amass_R;
$totalisomass = $totalisomass + $mass_R;
$RCount++;
$totalRCount++;
$prot_iso_mass += $mass_r_P;
$prot_ave_mass += $amass_r_P;
$aminoacids_count_protein[14]="R="."$totalRCount";
$basic_count++;
$prot_basic_count++;
$polar_count++;
$prot_polar_count++;
}
elsif (substr($peptide, $j, 1) eq "S"){
$totalavemass = $totalavemass + $amass_S;
$totalisomass = $totalisomass + $mass_S;
$SCount++;
$totalSCount++;
$prot_iso_mass += $mass_s_P;
$prot_ave_mass += $amass_s_P;
$aminoacids_count_protein[15]="S="."$totalSCount";
$polar_count++;
$prot_polar_count++;
}
elsif (substr($peptide, $j, 1) eq "T"){
$totalavemass = $totalavemass + $amass_T;
$totalisomass = $totalisomass + $mass_T;
$TCount++;
$totalTCount++;
$prot_iso_mass += $mass_t_P;
$prot_ave_mass += $amass_t_P;
$aminoacids_count_protein[16]="T="."$totalTCount";
$polar_count++;
$prot_polar_count++;
}
elsif (substr($peptide, $j, 1) eq "V"){
$totalavemass = $totalavemass + $amass_V;
$totalisomass = $totalisomass + $mass_V;
$VCount++;
$totalVCount++;
$prot_iso_mass += $mass_v_P;
$prot_ave_mass += $amass_v_P;
$aminoacids_count_protein[17]="V="."$totalVCount";
$nonpolar_count++;
$prot_nonpolar_count++;
}
elsif (substr($peptide, $j, 1) eq "W"){
$totalavemass = $totalavemass + $amass_W;
$totalisomass = $totalisomass + $mass_W;
$WCount++;
$totalWCount++;
$prot_iso_mass += $mass_w_P;
$prot_ave_mass += $amass_w_P;
$aminoacids_count_protein[18]="W="."$totalWCount";
$polar_count++;
$prot_polar_count++;
}
elsif (substr($peptide, $j, 1) eq "Y"){
$totalavemass = $totalavemass + $amass_Y;
$totalisomass = $totalisomass + $mass_Y;
$YCount++;
$totalYCount++;
$prot_iso_mass += $mass_y_P;
$prot_ave_mass += $amass_y_P;
$aminoacids_count_protein[19]="Y="."$totalYCount";
$polar_count++;
$prot_polar_count++;
}

elsif (substr($peptide, $j, 1) eq "a"){
$totalavemass = $totalavemass + $amass_a;
$totalisomass = $totalisomass + $mass_a;
}
elsif (substr($peptide, $j, 1) eq "c"){
$totalavemass = $totalavemass + $amass_c;
$totalisomass = $totalisomass + $mass_c;
}
elsif (substr($peptide, $j, 1) eq "d"){
$totalavemass = $totalavemass + $amass_d;
$totalisomass = $totalisomass + $mass_d;
}
elsif (substr($peptide, $j, 1) eq "e"){
$totalavemass = $totalavemass + $amass_e;
$totalisomass = $totalisomass + $mass_e;
}
elsif (substr($peptide, $j, 1) eq "f"){
$totalavemass = $totalavemass + $amass_f;
$totalisomass = $totalisomass + $mass_f;
}
elsif (substr($peptide, $j, 1) eq "g"){
$totalavemass = $totalavemass + $amass_g;
$totalisomass = $totalisomass + $mass_g;
}
elsif (substr($peptide, $j, 1) eq "h"){
$totalavemass = $totalavemass + $amass_h;
$totalisomass = $totalisomass + $mass_h;
}
elsif (substr($peptide, $j, 1) eq "i"){
$totalavemass = $totalavemass + $amass_i;
$totalisomass = $totalisomass + $mass_i;
}
elsif (substr($peptide, $j, 1) eq "k"){
$totalavemass = $totalavemass + $amass_k;
$totalisomass = $totalisomass + $mass_k;
}
elsif (substr($peptide, $j, 1) eq "l"){
$totalavemass = $totalavemass + $amass_l;
$totalisomass = $totalisomass + $mass_l;
}
elsif (substr($peptide, $j, 1) eq "m"){
$totalavemass = $totalavemass + $amass_m;
$totalisomass = $totalisomass + $mass_m;
}
elsif (substr($peptide, $j, 1) eq "n"){
$totalavemass = $totalavemass + $amass_n;
$totalisomass = $totalisomass + $mass_n;
}
elsif (substr($peptide, $j, 1) eq "p"){
$totalavemass = $totalavemass + $amass_p;
$totalisomass = $totalisomass + $mass_p;
}
elsif (substr($peptide, $j, 1) eq "q"){
$totalavemass = $totalavemass + $amass_q;
$totalisomass = $totalisomass + $mass_q;
}

elsif (substr($peptide, $j, 1) eq "r"){
$totalavemass = $totalavemass + $amass_r;
$totalisomass = $totalisomass + $mass_r;
}
elsif (substr($peptide, $j, 1) eq "s"){
$totalavemass = $totalavemass + $amass_s;
$totalisomass = $totalisomass + $mass_s;
}
elsif (substr($peptide, $j, 1) eq "t"){
$totalavemass = $totalavemass + $amass_t;
$totalisomass = $totalisomass + $mass_t;
}
elsif (substr($peptide, $j, 1) eq "v"){
$totalavemass = $totalavemass + $amass_v;
$totalisomass = $totalisomass + $mass_v;
}
elsif (substr($peptide, $j, 1) eq "w"){
$totalavemass = $totalavemass + $amass_w;
$totalisomass = $totalisomass + $mass_w;
}
elsif (substr($peptide, $j, 1) eq "y"){
$totalavemass = $totalavemass + $amass_y;
$totalisomass = $totalisomass + $mass_y;
}
} # end for

# add the mass of water to each masses
if( $totalavemass >0){
$totalavemass = $totalavemass + WATER;
}
if( $totalisomass >0){
$totalisomass = $totalisomass + WATER;
}

# keeps track of the total masses for the proteins calculations
$totalIsoMass += $totalisomass;
$totalAvgMass += $totalavemass;

# do not add these calculation to the incoplete case
if($complete ==1){
$totalLen += length($peptide);
# this for each peptide calculations
push(@len_peptide,length($peptide));
push(@charge_peptide, charge_of_peptides(\$peptide));
}

# this for each peptide calculations
push(@isoMass_peptide,$totalisomass);
my $l = $#isoMass_peptide;
# for each peptide keep teh count of total average mass and total iso mass
push(@avgMass_peptide,$totalavemass);

if($opt_d==1){
# writes out all the information to the outputfile1
write(SIMPLE);
}
# reinitialization
$totalavemass = 0;
$totalisomass = 0;
$totalisoEpoint = 0;

$A = $ACount; $C = $CCount; $D = $DCount; $E = $ECount; $F = $FCount; $G = $GCount;
$H = $HCount; $I = $ICount; $K = $KCount; $L = $LCount; $M = $MCount; $N = $NCount;
$P = $PCount; $Q = $QCount; $R = $RCount; $S = $SCount; $T = $TCount; $V = $VCount;
$W = $WCount; $Y = $YCount;
$nbr = $numpep; $oldpeptide = $peptide;
if($opt_a==1){
# write the info to the output file
write(ANNOTATED);
}

if($complete==1){
if ($ACount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[0][$ACount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[0][$ACount+1] = "$val=T";
}
} # increments when there are one or more A's in each peptide
if ($CCount <= MAX) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[1][$CCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[1][$CCount+1] = "$val=T";
}
} # increments when there are one or more C's in each peptide
if ($DCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[2][$DCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[2][$DCount+1] = "$val=T";
}
}
if ($ECount <= MAX) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[3][$ECount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[3][$ECount+1] = "$val=T";
}
}
if ($FCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[4][$FCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[4][$FCount+1] = "$val=T";
}
}
if ($GCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[5][$GCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[5][$GCount+1] = "$val=T";
}
}
if ($HCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[6][$HCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[6][$HCount+1] = "$val=T";
}
}
if ($ICount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[7][$ICount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[7][$ICount+1] = "$val=T";
}
}
if ($KCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[8][$KCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[8][$KCount+1] = "$val=T";
}
}
if ($LCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[9][$LCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[9][$LCount+1] = "$val=T";
}
}
if ($MCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[10][$MCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[10][$MCount+1] = "$val=T";
}
}
if ($NCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[11][$NCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[11][$NCount+1] = "$val=T";
}
}
if ($PCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[12][$PCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[12][$PCount+1] = "$val=T";
}
}
if ($QCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[13][$QCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[13][$QCount+1] = "$val=T";
}
}
if ($RCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[14][$RCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[14][$RCount+1] = "$val=T";
}
}
if ($SCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[15][$SCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[15][$SCount+1] = "$val=T";
}
}
if ($TCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[16][$TCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[16][$TCount+1] = "$val=T";
}
}
if ($VCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[17][$VCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[17][$VCount+1] = "$val=T";
}
}
if ($WCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[18][$WCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[18][$WCount+1] = "$val=T";
}
}
if ($YCount <= MAX ) {
my ($val,$bool) = split(/=/,$amino_acid_peptide[19][$YCount+1]);
if($bool !~ /^T$/){
$val++;
$amino_acid_peptide[19][$YCount+1] = "$val=T";
}
}

# Determines how many peptides have at least one of each of the twenty amino acids
if ($ACount >= 1 ) {
$Aexists++;
} # increments when there are one or more A's in each peptide
if ($CCount >= 1 ) {
$Cexists++;
} # increments when there are one or more C's in each peptide
if ($DCount >= 1 ) {
$Dexists++;
}
if ($ECount >= 1 ) {
$Eexists++;
}
if ($FCount >= 1 ) {
$Fexists++;
}
if ($GCount >= 1 ) {
$Gexists++;
}
if ($HCount >= 1 ) {
$Hexists++;
}
if ($ICount >= 1 ) {
$Iexists++;
}
if ($KCount >= 1 ) {
$Kexists++;
}
if ($LCount >= 1 ) {
$Lexists++;
}
if ($MCount >= 1 ) {
$Mexists++;
}
if ($NCount >= 1 ) {
$Nexists++;
}
if ($PCount >= 1 ) {
$Pexists++;
}
if ($QCount >= 1 ) {
$Qexists++;
}
if ($RCount >= 1 ) {
$Rexists++;
}
if ($SCount >= 1 ) {
$Sexists++;
}
if ($TCount >= 1 ) {
$Texists++;
}
if ($VCount >= 1 ) {
$Vexists++;
}
if ($WCount >= 1 ) {
$Wexists++;
}
if ($YCount >= 1 ) {
$Yexists++;
}

# Calculating hydrophobicity
my $hydrophobicity = ($ACount*1.8) + ($CCount*0.04) + ($DCount*(-0.72)) + ($ECount*(-0.62)) + ($FCount*0.61)
+ ($GCount*0.16) + ($HCount*(-0.40)) + ($ICount*0.73) + ($KCount*(-1.1)) + ($LCount*0.53)
+ ($MCount*0.26) + ($NCount*(-0.64)) + ($PCount*(-0.07)) + ($QCount*(-0.69)) + ($RCount*(-1.8))
+ ($SCount*(-0.26)) + ($TCount*(-0.18)) + ($VCount*0.54) + ($WCount*0.37) + ($YCount*0.02);
push(@hydrophobicity_peptide, $hydrophobicity);
}# end if
} #end sub

# writes to the output file
sub print_final
{
$nbr = " "; $oldpeptide = " ";

print ANNOTATED "\n";
print ANNOTATED "\n";
print ANNOTATED " Amino Acid Representation \n";
print ANNOTATED " The number of peptides with at least one of each amino acid\n";
print ANNOTATED "\n";
print ANNOTATED " A C D E F G H I K L M N P Q R S T V W Y\n";
print ANNOTATED " ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n";
# print "\n";
# print " Amino Acid Representation \n";
# print " The number of peptides with at least one of each amino acid\n";
# print "\n";
# print " A C D E F G H I K L M N P Q R S T V W Y\n";
# print " ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n";
$A = $Aexists; $C = $Cexists; $D = $Dexists; $E = $Eexists; $F = $Fexists;
$G = $Gexists; $H = $Hexists; $I = $Iexists; $K = $Kexists; $L = $Lexists;
$M = $Mexists; $N = $Nexists; $P = $Pexists; $Q = $Qexists; $R = $Rexists;
$S = $Sexists; $T = $Texists; $V = $Vexists; $W = $Wexists; $Y = $Yexists;
#**************debug**********************
# write(STDOUT);
#*****************************************
write(ANNOTATED);
print ANNOTATED "\n";
print ANNOTATED "\n";
print ANNOTATED " Total Number of Each Amino Acid in the Protein \n";
print ANNOTATED "\n";
print ANNOTATED " A C D E F G H I K L M N P Q R S T V W Y\n";
print ANNOTATED " ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n";
# print "\n";
# print " Total Number of Each Amino Acid in the Protein \n";
# print "\n";
# print " A C D E F G H I K L M N P Q R S T V W Y\n";
# print " ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n";

$A = $totalACount; $C = $totalCCount; $D = $totalDCount; $E = $totalECount;
$F = $totalFCount; $G = $totalGCount; $H = $totalHCount; $I = $totalICount;
$K = $totalKCount; $L = $totalLCount; $M = $totalMCount; $N = $totalNCount;
$P = $totalPCount; $Q = $totalQCount; $R = $totalRCount; $S = $totalSCount;
$T = $totalTCount; $V = $totalVCount; $W = $totalWCount; $Y = $totalYCount;

write(ANNOTATED);
write(PEPTIDES);
#**************debug**********************
# write(STDOUT);
#*****************************************
print ANNOTATED "\n";
print ANNOTATED "\n";
print ANNOTATED "Number of acidic residues in protein: $prot_acid_count\n";
print ANNOTATED "Number of basic residues in protein: $prot_basic_count\n";
print ANNOTATED "Number of polar residues in protein: $prot_polar_count\n";
print ANNOTATED "Number of nonpolar residues in protein: $prot_nonpolar_count\n";
print ANNOTATED "Isotopic mass of protein: $prot_iso_mass\n";
print ANNOTATED "Average mass of protein: $prot_ave_mass\n";
print ANNOTATED "\n";
print ANNOTATED "\n";
print ANNOTATED "\n";
}

# this sub routine generates the missed clevages as well as the hash to
# store the postions of peptides in the input file
sub missedCleavage{
my ($peptides_array) = shift;
my $len = @$peptides_array;
for my $q(0 .. $missed_cleavage){
my $initial=1;
for my $i(0 .. $len-1){
if($i>0){
$initial += length($$peptides_array[$i-1]);
}
my $x = $i + $q;
my $string;
if($x >= $len){
last;
}
for my $j ($i .. $x){
$string = $string.$$peptides_array[$j];
}
my $l = length($string);
my $position = join("-",$initial, $initial+$l-1);
$peptide_hash{$position} = $string;
}
}
}

# this subroutine takes care of doing the analysis for each protein.
sub process(){
if($protein && !($protein =~ /^$/)){
foreach my $i (0 ..$#breakAt){
$protein =~ s/$breakAt[$i]/$breakAt2[$i]/g;
}

# the unsorted array that stores peptides
my @peptides=split(/X/,$protein);

#for (my $i=0; $i<=$#peptides; $i++) {
# print "peptides at $i is $peptides[$i]\n";
#}

$totalProteins++;
if(@peptides){
missedCleavage(\@peptides);
my @sorted = sort { length ($peptide_hash{$b}) <=> length ($peptide_hash{$a})} keys %peptide_hash;
my %rev = reverse %peptide_hash;

%peptide_hash = reverse %rev;

# if user wants the simple file
if($opt_d==1){
# writes the protein name
print SIMPLE "Protein Name: $protein_name\n";
print SIMPLE "\n";
# write the header to output file 1
print SIMPLE " No. Range Isotopic Mass Average Mass Peptide\n";
}

if($opt_a==1){
print ANNOTATED "Protein Name: $protein_name\n";
print ANNOTATED "\n";

# write the header to output file 2
print ANNOTATED " ***Amino Acid Count Within Peptide***\n";
print ANNOTATED "No. A C D E F G H I K L M N P Q R S T V W Y Peptide\n";
print ANNOTATED "------- +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+--------\n";
}

# initialize the

my $i=0;
my $flag;
foreach $a (@sorted){

if(length ($peptide_hash{$a}) >$peptide_length){
$flag=0;
$totalPeptides++;
$totalPepds++;
analyze_peptide(\$peptide_hash{$a},\++$i, \$a, 1);
# if incomplete peptide modificaitons is selected then
# do the substitutions and make this peptide a new peptide
if($opt_N==1){
my $contains=0;
my $copy;
my $count;
my $where;
my $orig;
my @copies;
if($opt_S==1 && $peptide_hash{$a}=~ /S/){
my @mod_peps;
$copy = $peptide_hash{$a};
$orig = $copy;
$count = $copy =~ tr/S//;
$where = index($copy, "S");
substr($copy,$where,1) =~ s/S/s/;
$totalPeptides++;
$totalPepds++;
$copies[0]=$copy;
analyze_peptide(\$copies[0],\++$i,\$a, 1);

for (my $m=1;$m<=$count; $m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /S/) {
$where = index($copies[$m], "S");
substr($copies[$m], $where, 1) =~ s/S/s/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/S/s/;
analyze_peptide(\$temp,\++$i,\$a, 1);
$totalPeptides++;
$totalPepds++;
}

}
@copies = "";
}
if($opt_T==1 && $peptide_hash{$a}=~ /T/){

my @mod_peps;
$copy = $peptide_hash{$a};
$orig = $copy;
$count = $copy =~ tr/T//;
$where = index($copy, "T");
substr($copy,$where,1) =~ s/T/t/;
$totalPeptides++;
$totalPepds++;
$copies[0]=$copy;
analyze_peptide(\$copies[0],\++$i,\$a, 1);

for (my $m=1;$m<=$count; $m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /T/) {
$where = index($copies[$m], "T");
substr($copies[$m], $where, 1) =~ s/T/t/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/T/t/;
analyze_peptide(\$temp,\++$i,\$a, 1);
$totalPeptides++;
$totalPepds++;
}

}
@copies = "";
}

if($opt_Y==1 && $peptide_hash{$a}=~ /Y/){

my @mod_peps;
$copy = $peptide_hash{$a};
$orig = $copy;
$count = $copy =~ tr/Y//;
$where = index($copy, "Y");
substr($copy,$where,1) =~ s/Y/y/;
$totalPeptides++;
$totalPepds++;
$copies[0]=$copy;
analyze_peptide(\$copies[0],\++$i,\$a, 1);

for (my $m=1;$m<=$count; $m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /Y/) {
$where = index($copies[$m], "Y");
substr($copies[$m], $where, 1) =~ s/Y/y/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/Y/y/;
analyze_peptide(\$temp,\++$i,\$a, 1);
$totalPeptides++;
$totalPepds++;
}

}

@copies = "";
}

if($opt_I==1 && $peptide_hash{$a}=~ /C/){

my @mod_peps;
$copy = $peptide_hash{$a};
$orig = $copy;
$count = $copy =~ tr/C//;
$where = index($copy, "C");
substr($copy,$where,1) =~ s/C/c/;
$totalPeptides++;
$totalPepds++;
$copies[0]=$copy;
$mass_C =
$mass_C + 415.5;
$amass_C =
$amass_C + 415.5;
analyze_peptide(\$copies[0],\++$i,\$a, 1);
$mass_C =
103.00919;
$amass_C =
103.1388;
$mass_C =
$mass_C + 423.5;
$amass_C =
$amass_C = 423.5;
analyze_peptide(\$copies[0],\++$i,\$a,
1);
$mass_C =
103.00919;
$amass_C =
103.1388;
$totalPeptides++;
$totalPepds++;
for (my $m=1;$m<=$count; $m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /C/) {
$where = index($copies[$m], "C");
substr($copies[$m], $where, 1) =~ s/C/c/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/C/c/;
$mass_C
= $mass_C + 415.5;
$amass_C
= $amass_C + 423.5;
analyze_peptide(\$temp,\++$i,\$a, 1);
$totalPeptides++;
$totalPepds++;
$mass_C
= 103.00919;
$amass_C
= 103.1388;
$mass_C
= $mass_C + 415.5;
$amass_C
= $amass_C + 423.5;
analyze_peptide(\$temp,\++$i,\$a,1);
$mass_C
= 103.00919;
$amass_C
= 103.1388;
$totalPeptides++;
$totalPepds++;
}

}
@copies = "";
}

if($opt_M==1 && $peptide_hash{$a}=~ /K/){

my @mod_peps;
$copy = $peptide_hash{$a};
$orig = $copy;
$count = $copy =~ tr/K//;
$where = index($copy, "K");
substr($copy,$where,1) =~ s/K/k/;
$totalPeptides++;
$totalPepds++;
$copies[0]=$copy;
analyze_peptide(\$copies[0],\++$i,\$a, 1);

for (my $m=1;$m<=$count; $m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /K/) {
$where = index($copies[$m], "K");
substr($copies[$m], $where, 1) =~ s/K/k/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/K/k/;
analyze_peptide(\$temp,\++$i,\$a, 1);
$totalPeptides++;
$totalPepds++;
}

}

@copies = "";
}

if($opt_R && $opt_W){
$copy = $peptide_hash{$a};
$orig = $copy;

foreach my $aa (@amino_acids){
# now add the given weight to the given AA
if($aa =~ /^A$/ && $copy=~ /A/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/A//;
$where = index($copy, "A");
substr($copy2, $where, 1) =~ s/A/aZ/;
substr($copy, $where, 1) =~ s/A/a/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /A/) {
$where = index($copies[$m], "A");
substr($copies[$m], $where, 1) =~ s/A/a/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/A/aZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}

}

elsif($aa =~ /^C$/ && $copy=~ /C/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/C//;
$where = index($copy, "C");
substr($copy2, $where, 1) =~ s/C/cZ/;
substr($copy, $where, 1) =~ s/C/c/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /C/) {
$where = index($copies[$m], "C");
substr($copies[$m], $where, 1) =~ s/C/c/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/C/cZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^D$/ && $copy=~ /D/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/D//;
$where = index($copy, "D");
substr($copy2, $where, 1) =~ s/D/dZ/;
substr($copy, $where, 1) =~ s/D/d/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /D/) {
$where = index($copies[$m], "D");
substr($copies[$m], $where, 1) =~ s/D/d/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/D/dZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^E$/ && $copy=~ /E/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/E//;
$where = index($copy, "E");
substr($copy2, $where, 1) =~ s/E/eZ/;
substr($copy, $where, 1) =~ s/E/e/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /E/) {
$where = index($copies[$m], "E");
substr($copies[$m], $where, 1) =~ s/E/e/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/E/eZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^F$/ && $copy=~ /F/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/F//;
$where = index($copy, "F");
substr($copy2, $where, 1) =~ s/F/fZ/;
substr($copy, $where, 1) =~ s/F/f/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /F/) {
$where = index($copies[$m], "F");
substr($copies[$m], $where, 1) =~ s/F/f/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/F/fZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^G$/ && $copy=~ /G/){
$flag=1;
my $copy2 = $copy;
$count = $copy =~ tr/G//;
$where = index($copy, "G");
substr($copy2, $where, 1) =~ s/G/gZ/;
substr($copy, $where, 1) =~ s/G/g/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /G/) {
$where = index($copies[$m], "G");
substr($copies[$m], $where, 1) =~ s/G/g/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/G/gZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}

}
elsif($aa =~ /^H$/ && $copy=~ /H/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/H//;
$where = index($copy, "H");
substr($copy2, $where, 1) =~ s/H/hZ/;
substr($copy, $where, 1) =~ s/H/h/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /H/) {
$where = index($copies[$m], "H");
substr($copies[$m], $where, 1) =~ s/H/h/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/H/hZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}

}
elsif($aa =~ /^I$/ && $copy=~ /I/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/I//;
$where = index($copy, "I");
substr($copy2, $where, 1) =~ s/I/iZ/;
substr($copy, $where, 1) =~ s/I/i/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /I/) {
$where = index($copies[$m], "I");
substr($copies[$m], $where, 1) =~ s/I/i/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/I/iZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}

}
elsif($aa =~ /^K$/ && $copy=~ /K/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/K//;
$where = index($copy, "K");
substr($copy2, $where, 1) =~ s/K/kZ/;
substr($copy, $where, 1) =~ s/K/k/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /K/) {
$where = index($copies[$m], "K");
substr($copies[$m], $where, 1) =~ s/K/k/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/K/kZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^L$/ && $copy=~ /L/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/L//;
$where = index($copy, "L");
substr($copy2, $where, 1) =~ s/L/lZ/;
substr($copy, $where, 1) =~ s/L/l/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /L/) {
$where = index($copies[$m], "L");
substr($copies[$m], $where, 1) =~ s/L/l/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/L/lZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
if($aa =~ /^M$/ && $copy=~ /M/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/M//;
$where = index($copy, "M");
substr($copy2, $where, 1) =~ s/M/mZ/;
substr($copy, $where, 1) =~ s/M/m/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /M/) {
$where = index($copies[$m], "M");
substr($copies[$m], $where, 1) =~ s/M/m/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/M/mZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^N$/ && $copy =~/N/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/N//;
$where = index($copy, "N");
substr($copy2, $where, 1) =~ s/N/nZ/;
substr($copy, $where, 1) =~ s/N/n/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /N/) {
$where = index($copies[$m], "N");
substr($copies[$m], $where, 1) =~ s/N/n/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/N/nZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
if($aa =~ /^P$/ && $copy=~ /P/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/P//;
$where = index($copy, "P");
substr($copy2, $where, 1) =~ s/P/pZ/;
substr($copy, $where, 1) =~ s/P/p/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /P/) {
$where = index($copies[$m], "P");
substr($copies[$m], $where, 1) =~ s/P/p/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/P/pZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}

}
elsif($aa =~ /^Q$/ && $copy=~ /Q/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/Q//;
$where = index($copy, "Q");
substr($copy2, $where, 1) =~ s/Q/qZ/;
substr($copy, $where, 1) =~ s/Q/q/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /Q/) {
$where = index($copies[$m], "Q");
substr($copies[$m], $where, 1) =~ s/Q/q/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/Q/qZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^R$/ && $copy=~ /R/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/R//;
$where = index($copy, "R");
substr($copy2, $where, 1) =~ s/R/rZ/;
substr($copy, $where, 1) =~ s/R/r/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /R/) {
$where = index($copies[$m], "R");
substr($copies[$m], $where, 1) =~ s/R/r/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/R/rZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^S$/ && $copy=~ /S/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/S//;
$where = index($copy, "S");
substr($copy2, $where, 1) =~ s/S/sZ/;
substr($copy, $where, 1) =~ s/S/s/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /S/) {
$where = index($copies[$m], "S");
substr($copies[$m], $where, 1) =~ s/S/s/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/S/sZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^T$/ && $copy=~ /T/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/T//;
$where = index($copy, "T");
substr($copy2, $where, 1) =~ s/T/tZ/;
substr($copy, $where, 1) =~ s/T/t/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /T/) {
$where = index($copies[$m], "T");
substr($copies[$m], $where, 1) =~ s/T/t/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/T/tZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}

}
if($aa =~ /^V$/ && $copy=~ /V/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/V//;
$where = index($copy, "V");
substr($copy2, $where, 1) =~ s/V/vZ/;
substr($copy, $where, 1) =~ s/V/v/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /V/) {
$where = index($copies[$m], "V");
substr($copies[$m], $where, 1) =~ s/V/v/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/V/vZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^W$/ && $copy=~ /W/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/W//;
$where = index($copy, "W");
substr($copy2, $where, 1) =~ s/W/wZ/;
substr($copy, $where, 1) =~ s/W/w/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /W/) {
$where = index($copies[$m], "W");
substr($copies[$m], $where, 1) =~ s/W/w/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/W/wZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
elsif($aa =~ /^Y$/ && $copy=~ /Y/){
$flag =1;
my $copy2 = $copy;
$count = $copy =~ tr/Y//;
$where = index($copy, "Y");
substr($copy2, $where, 1) =~ s/Y/yZ/;
substr($copy, $where, 1) =~ s/Y/y/;
$copies[0]=$copy;
analyze_peptide(\$copy2, \++$i, \$a, 0);

for (my $m=1;$m<=$count;$m++) {
$copies[$m]=$copies[$m-1];
if($copies[$m] =~ /Y/) {
$where = index($copies[$m], "Y");
substr($copies[$m], $where, 1) =~ s/Y/y/;
my $temp = $orig;
substr($temp, $where, 1) =~ s/Y/yZ/;
analyze_peptide(\$temp, \++$i, \$a, 0);
$totalPeptides++;
$totalPepds++;

}

}
}
} #end for
#if($flag==1){
# $totalPeptides++;
# $totalPepds++;
# analyze_peptide(\$copy,\++$i,\$a, 0);

#}
} #end if

} # end opt_N
if (($opt_N == 0) && ($opt_I == 1) &&
($peptide_hash{$a} =~ /C/)) {
my $copy = $peptide_hash{$a};
$mass_C = $mass_C + 415.5;
$amass_C = $amass_C + 415.5;
analyze_peptide(\$copy, \++$i,
\$a, 0);
$totalPeptides++;
$totalPepds++;
$mass_C = 103.00919;
$amass_C = 103.1388;
$mass_C = $mass_c + 423.5;
$amass_C = $amass_C + 423.5;
analyze_peptide(\$copy, \++$i,
\$a, 0);
$mass_C = 103.00919;
$amass_C = 103.1388;
$totalPeptides++;
$totalPepds++;
}
}
} #end for

# store all the necessary calculations(for each protein) for the summary
push(@isoMass_protein, $totalIsoMass);
push(@avgMass_protein, $totalAvgMass);

# collect these results only if complete is 1

# for each protein store the total length of the protein
push(@len_protein, $totalLen);
# for each protein store the count of amino acid composition
push(@amino_acid_protein,[@aminoacids_count_protein]);
# for each protein store the total peptides for this protein
push(@peptides_protein, $totalPepds);

# if annotated is chosen by the user then write data to annotated file
if($opt_a==1){
print_final();
}
# re-initializes the values
reinitialize();
initializeValues();
# if user has chosen to have a simple file then write data to simple file
if($opt_d==1){
# write out the number of peptides in each protein
print SIMPLE "Number of peptides = $numpep\n";
print SIMPLE "\n";
}

} # end if
else{
print "There is no proteins to process\n";
exit(0);
}

}

} #end sub

# this subroutine returns the sum of the array elements
sub sum{
my ($dataset) = shift;
my $result;
for (@{$dataset}) {
$result += $_;
}
return $result;
}

# this subroutine returns the mean of a given set of data
sub average{
my ($dataset) = shift;
my $sum = sum(\@{$dataset});

my $len = @{$dataset};
return $sum/$len;

}

#this subrotine returns the standard deviation of a data set
sub standard_deviation{
my ($dataset) = shift;
my $sum_square;
my $sum;
my $n = @{$dataset};
for (@{$dataset}) {
$sum_square += $_ ** 2;
$sum += $_;
}
my $numerator = ($n * $sum_square)-($sum ** 2);
if($n==1){
return sqrt($numerator / ($n*$n));
}
return sqrt($numerator / ($n*($n-1)));
}

########################################### PEPTIDE ANALYSIS SECTION ####################################

# this subroutine calculates the charge of every peptide
sub charge_of_peptides {
my ($p) = shift;
my $peptide = $$p;
my $charge = 0;

for($j = 0; $j<=(length($peptide)); $j++){

if (substr($peptide, $j, 1) eq "D"){
$charge += -1;
}
if (substr($peptide, $j, 1) eq "E"){
$charge += -1;
}
if (substr($peptide, $j, 1) eq "H"){
$charge += 1;
}
if (substr($peptide, $j, 1) eq "K"){
$charge += 1;
}
if (substr($peptide, $j, 1) eq "R"){
$charge += 1;
}

} # end for
return $charge;
}

sub physicalProperties{

my $description = shift;
my $range = shift;
my $max = shift;
my ($dataset) = shift;
my $choice = shift;
my $initial = shift;
my $denominator = $totalProteins;

if($choice =~ /Peptides/){ $denominator = $totalPeptides;}
my $string;
my %buckets;
# build the buckets
my $j=$initial;
my $k=$j;
if($description =~ /^Charge$/){
displayCharge($description,$initial,$denominator,$range,$max,\@{$dataset});
}
else{
for(my $i=1;$i <= $max;$i++){
$k += $range;
my $key = $j."-"."$k";
$buckets{$key} =0;
$j= $k;
}
my $lastKey = ">=$j";
$buckets{$lastKey}=0;
# now fill the buckets
for (@{$dataset}) {
foreach my $key (keys %buckets){
if($key =~ /^>=/){
my $n = $key;
$n =~ s/^>=//g;
if($_ >= $n){
$buckets{$key}++;
}
}else{
my $g; my $h;
if($initial<0){
my $dup = $key;
if($dup =~ /--/){
$dup =~ s/\--/\^-/g;
($g,$h) = split (/\^/,$dup,2);
}
elsif($key =~ /^-1-0$/){
$g = -1;
$h=0;
}
else{
($g,$h) = split (/\-/,$key,2);
}

}
else{
($g,$h) = split (/\-/,$key,2);
}
if(($_ > $g) && ($h >= $_)){
$buckets{$key}++;
}
else{
next;
}
}
}
}

$string = "<table border celllpadding=10 cellspacing=0 align=center width=100%>\n";
$string.=" <tr>\n";
$string.="<th bgcolor=#800000>$description</th>\n";

foreach my $key (sort { my ($v,$y)= split("_",$a); $v<=>$b}keys %buckets){
if($key =~ />/){
$lastKey = $key;
}
else{
my ($a, $b);
if($key =~ /^\-/){
$b = $key;
$b =~ s/^\-\d+\-//g;
$a = $key;
if($a =~ /\-\-/){
$a =~ s/\-\-\d+//g;
}
elsif($a =~ /\-\d+$/){
$a =~ s/\-\d+$//g;
}
}
else{
($a, $b) = split("-",$key);
}
$string .= "<th bgcolor=#800000>$a - $b</th>\n";
}
}
$string .= "<th bgcolor=#800000>$lastKey</th>\n";
$string .= "</tr>\n";
$string .= "<tr>\n";
$string.="<th align=left>Count</th>\n";
foreach my $key (sort {my ($v,$y)= split("_",$a); $v<=>$b}keys %buckets){
if($key =~ />/){
$lastKey = $key;
}
else{
$string .= "<td align=center>$buckets{$key}</td>\n";
}
}
$string .= "<td align=center>$buckets{$lastKey}</td>\n";
$string .= "</tr>\n";
$string.=" <tr>\n";
$string.="<th align=left>Fraction</th>\n";
foreach my $key (sort {my ($v,$y)= split("_",$a); $v<=>$b}keys %buckets){
if($key =~ />/){
$lastKey = $key;
}
else{
my $val = $buckets{$key}/$denominator;
$val = sprintf("%.3f",$val);
$string .= "<td align=center>$val</td>\n";
}
}
my $val = $buckets{$lastKey}/$denominator;
$val = sprintf("%.3f",$val);
$string .= "<td align=center>$val</td>\n";
$string .= "</tr>\n";
$string .= "</table> \n";
print SUMMARY $string;
}

}

sub displayCharge{
# print "IN display charge\n";
my $string;
my $description = shift;
my $initial = shift;
my $denominator = shift;
my $range = shift;
my $max = shift;
my ($dataset) = shift;
my %buckets;
my $j = $initial;
for(my $i=$initial;$i <= $max;$i++){
my $key = $j;
$buckets{$key} =0;
$j = $j +1;
}
# print "J is $j \t$range\t$max\n";
#sleep 5;
# now fill the buckets
for (@{$dataset}) {
if($_>$max){
$buckets{$max}++;
}
elsif($_ <$initial){
$buckets{$initial}++;
}
else{
$buckets{$_}++;
}
# print "VAL IS $_\n";
#sleep 1;
#$buckets{$_}++;
}
$string = "<table border celllpadding=10 cellspacing=0 align=center width=100%>\n";
$string.=" <tr>\n";
$string.="<th bgcolor=#800000>$description</th>\n";
foreach $a (sort{$a <=>$b} keys %buckets){
if($a >0){ $a = "+$a";}
if($a==$max){
$a .= ">=";
}
if($a==$initial){
$a.= "<=";
}
$string .= "<th bgcolor=#800000>$a</th>\n";
# print " $a : $buckets{$a}\n";
}
$string .= "</tr>\n";
$string .= "<tr>\n";
$string.="<th align=left>Count</th>\n";
foreach $a (sort{$a <=>$b} keys %buckets){
$string .= "<td align=center>$buckets{$a}</td>\n";

}
$string .= "</tr>\n";
$string.=" <tr>\n";
$string.="<th align=left>Fraction</th>\n";
foreach $a (sort{$a <=>$b} keys %buckets){
my $val = $buckets{$a}/$denominator;
$val = sprintf("%.3f",$val);
$string .= "<td align=center>$val</td>\n";

}

$string .= "</tr>\n";
$string .= "</table> \n";
print SUMMARY $string;

}

sub physicalStatistics{
my $choice = shift;
my $BODY;
$BODY=<<END;
<table border celllpadding="10" cellspacing="0" align="center" width="100%">
<tr>
<th bgcolor="#800000">Description</th>
<th bgcolor="#800000">Mean</th>
<th bgcolor="#800000">Std. Dev.</th>
<th bgcolor="#800000">Maximum</th>
<th bgcolor="#800000">Minimum</th>
<th bgcolor="#800000">Total</th>
</tr>
END
print SUMMARY $BODY;
if($choice =~ /^Protein$/){
fillStatistics("Length",\@len_protein);
fillStatistics("Isotopic Mass",\@isoMass_protein);
fillStatistics("Average Mass", \@avgMass_protein);
}

elsif($choice =~ /^Peptide$/){
fillStatistics("No. of Peptides/Protein",\@peptides_protein);
}
elsif($choice =~ /^Protein\/AA$/){
aminoAcidStatistics();
}
print SUMMARY "</table>\n";

}

# this subroutine arranges the data for peptides to be displayed
sub aminoAcidRepPeptide{
for my $a (0 .. $#amino_acid_peptide){
my $description = aminoAcids($amino_acid_peptide[$a][0]);
my @temp;
for my $i (1 .. $#{$amino_acid_peptide[$a]}){
my($val,$bool) = split(/=/,$amino_acid_peptide[$a][$i]);
push(@temp,$val);

}
displayAAOccurences($description,\@temp);
}

}

sub aminoAcids{
my $AA = shift;
if($AA =~ /^A$/){ $AA ="Alanine(A)"; }
if($AA =~ /^C$/){ $AA ="Cysteine(C)";}
if($AA =~ /^D$/){ $AA ="Aspartic Acid(D)";}
if($AA =~ /^E$/){ $AA ="Glutamic Acid(E)";}
if($AA =~ /^F$/){ $AA ="Phenylalanine(F)";}
if($AA =~ /^G$/){ $AA ="Glycine(G)";}
if($AA =~ /^H$/){ $AA ="Histidine(H)";}
if($AA =~ /^I$/){ $AA ="Isoleucine(I)";}
if($AA =~ /^K$/){ $AA ="Lysine(K)";}
if($AA =~ /^L$/){ $AA ="Leucine(L)";}
if($AA =~ /^M$/){ $AA ="Methionine(M)";}
if($AA =~ /^N$/){ $AA ="Asparagine(N)";}
if($AA =~ /^Q$/){ $AA ="Glutamine(Q)";}
if($AA =~ /^P$/){ $AA ="Proline(P)";}
if($AA =~ /^R$/){ $AA ="Arginine(R)";}
if($AA =~ /^S$/){ $AA ="Serine(S)";}
if($AA =~ /^T$/){ $AA ="Threonine(T)";}
if($AA =~ /^V$/){ $AA ="Valine(V)";}
if($AA =~ /^W$/){ $AA ="Tryptophan(W)";}
if($AA =~ /^Y$/){ $AA ="Tyrosine(Y)";}
return $AA;
}

# protein for the proteome
sub displayAAOccurences{

my $AA = shift;
my ($dataset) =shift;
my $string;
my $description = aminoAcids($AA);

$string = "<table border celllpadding=10 cellspacing=0 align=center width=100%>\n";
$string .= "<tr>\n";
$string .= "<th bgcolor=#800000>$description</th>\n";
for my $a (0 .. @{$dataset}-1) {
$string .= "<th bgcolor=#800000>$a</th>\n";
}
$string .= "</tr>\n";
$string .= "<tr>\n";
$string .= "<th align=left>Count</th>\n";
for my $a (0 .. @{$dataset}-1) {
$string .= "<td align=center>@{$dataset}[$a]</td>\n";
}
$string .= "</tr><tr>\n";
$string .= "<th align=left>Fraction</th>\n";
for my $a (0 .. @{$dataset}-1) {
my $fraction = (@{$dataset}[$a])/($totalProteins);
$fraction = sprintf("%.3f",$fraction);
$string .= "<td>$fraction</td>\n";
}

$string .= "</tr>\n";
$string .= "</table> \n";
print SUMMARY $string ;
}

# this subroutine takes care of displaying amino acid occurences per protein
# for the proteome
sub aminoAcidOccurences(){
my $max = 50;
my $AA; my $val;

for my $i (0 .. $#AA_perProtein){
my @occurances;
for my $a (0 .. $max-1){
$occurances[$a]=0;
}
for my $j (0 .. $#{$AA_perProtein[$i]}){
($AA,$val) = split(/=/,$AA_perProtein[$i][$j]);
if($val < $max){
$occurances[$val]++;
}
}
displayAAOccurences($AA,\@occurances);
@occurances=();
}

}

# caclulates and displays amino acid composition for each protein
sub aminoAcidStatistics(){
my $amino_acid;
my $HEADER;
my $val;
my $max;
for my $i (0 .. 19){
my @amino_acid_count;
my @AA;
for my $a (0 ..$#amino_acid_protein) {
if($amino_acid_protein[$a][0] !~ /^$/){
($amino_acid,$val) = split(/=/,$amino_acid_protein[$a][0]);
}
else{ $val =0;}
push(@amino_acid_count,$val);
# remove the element
shift @{$amino_acid_protein[$a]};
# rebuild the array for later use
my $element = join("=",$amino_acid,$val);
push(@AA,$element);
}
push (@AA_perProtein,[@AA]);
# keeps track of the maximum amount of amino acid per protein for the
# displaying of occurences of amino acids
$max = fillStatistics($amino_acid,\@amino_acid_count);
}

}

sub fillStatistics(){
my $description = shift;
my ($dataset) = shift;
my @dup = @{$dataset};
@dup = sort{$b<=>$a} @dup;
my $string;
my $avg = average(\@dup);
$avg = sprintf("%.3f",$avg);
my $std = standard_deviation(\@dup);
$std = sprintf("%.3f",$std);
my $total = sum(\@dup);
$total = sprintf("%.0f",$total);
my $max = $dup[0];
my $min = $dup[$#dup];
if($description =~ /^Isotopic Mass$/ ||$description =~ /^Average Mass$/||$description =~ /^Isoelectric Point$/ ){
$avg = $b = sprintf("%.2f", $avg);
$max = sprintf("%.0f",$max);
$min = sprintf("%.0f",$min);
$total = sprintf("%.2f",$total);
}
$string = "<tr>\n";
$string .= " <th align=left>$description</th>\n";
$string .= " <td align=center>$avg</td>\n";
$string .= " <td align=center>$std</td>\n";
$string .= " <td align=center>$max</td>\n";
$string .= " <td align=center>$min</td>\n";
$string .= " <td align=center>$total</td>\n";
$string .= "</tr>\n";
print SUMMARY $string;
# useful only when displaying the amino acid occurence
return ($dup[0]);
}

sub writeSummary(){
my $cleavage = join('X,',@breakAt);
$cleavage .= 'X';
my $HEADER;
$HEADER=<<END;
<html>
<head>
<title> Proteome Summary</title>
</head>

<body>
ProteoGest ANALYSIS of '$input_file' digested with $cleavage
 
Options: -i $opt_i -d $opt_d -a $opt_a -c $opt_c -s $opt_s -S $opt_S $opt_T $opt_Y $opt_R $opt_W $opt_I $opt_M $opt_C $opt_N $opt_G $opt_L -H $opt_H $opt_h
TOTAL NUMBER OF PEPTIDES: $totalPeptides
TOTAL NUMBER OF PROTEINS: $totalProteins
PROTEIN ANALYSIS of '$input_file' proteome
END
print SUMMARY $HEADER;
physicalStatistics("Protein");
$HEADER=<<END;
 
Distribution of Protein Properties for '$input_file'
 
END
print SUMMARY $HEADER;
physicalProperties("Length(AA)",100,10,\@len_protein,"Proteins",0);
physicalProperties("Isotopic Mass",10000,10,\@isoMass_protein,"Proteins",0);
physicalProperties("Average Mass",10000,10,\@avgMass_protein,"Proteins",0);
$HEADER=<<END;
Protein Statistics (Amino Acid Composition) 
<table border celllpadding="10" cellspacing="0" align="center" width="100%">
END
print SUMMARY $HEADER;
physicalStatistics("Protein/AA");

$HEADER=<<END;
 
Protein Statistics (Residue Representation)
This table lists the frequency of each amino acid residue per protein for the proteome
END
print SUMMARY $HEADER;
aminoAcidOccurences();
$HEADER=<<END;
 
PEPTIDE ANALYSIS for '$input_file' digested with $cleavage
Peptide statistics(Digestion of '$input_file' with $cleavage)
END
print SUMMARY $HEADER;
physicalStatistics("Peptide");

$HEADER=<<END;
 
Distribution of peptide properties in '$input_file' proteome following digestion with $cleavage
 
END
print SUMMARY $HEADER;
physicalProperties("Length(AA)",5,10,\@len_peptide,"Peptides",0);
physicalProperties("Isotopic Mass",500,10,\@isoMass_peptide,"Peptides",0);
physicalProperties("Average Mass",500,10,\@avgMass_peptide,"Peptides",0);
physicalProperties("Hydrophobicity",1,10,\@hydrophobicity_peptide,"Peptides",-4);
physicalProperties("Charge",1,10,\@charge_peptide,"Peptides",-6);
$HEADER=<<END;
Protein-Peptide Statistics (Amino Acid Representation)
This Table lists the number (and
fraction) of proteins with peptides with the tabulated number of each amino acid eg.
the first table lists the count of proteins with peptides that contain at least
0,1,2,3,4,5,6,7,8,9,10,... alanines (top row). The next row
gives the same data represented as the fraction of all proteins.
<table border celllpadding="10" cellspacing="0" align="center" width="100%">
END
print SUMMARY $HEADER;
aminoAcidRepPeptide();

if($opt_H==1 &&$display_redun==1){
# display the redundancy if asked by the user
print SUMMARY "Redundant Peptides\n";
print SUMMARY "This Table lists peptides that occur in more than one protein\n";
displayRedundancy();
}
elsif($opt_H==1 &&$display_redun==0){
# display the redundancy if asked by the user
print SUMMARY "Redundant Peptides\n";
print SUMMARY " NO REDUNDANCY DETECTED IN THIS PROTEOME!\n";
}
print SUMMARY "</body>\n";
print SUMMARY "</html>\n";
}

# this subroutine displays the redundancy peptides in the summary file
sub displayRedundancy(){
my $HEAD;
my $string;
$HEAD=<<END;

<center>
<table border="1" celllpadding="0" cellspacing="0" style="border-collapse: collapse" align="center" width="80%">
<tr>
<th width='25%' bgcolor='#800000'>Peptide</th>
<th width= '75%' bgcolor='#800000'>Name of the protein</th>
</tr>
END
print SUMMARY $HEAD;

foreach my $b (sort keys %redun_hash){

if(@{$redun_hash{$b}} > 1){
$string = "<tr>\n<td width='25%'>$b</td>\n";
$string .= "<td width='75%'>\n";
my $i=0;
foreach my $protein (@{$redun_hash{$b}}){
++$i;
$string .= "$i) $protein \n";
#print "REDUN: $b=>$protein,\n";
}
$string .= "</td>\n</tr>\n";

print SUMMARY $string;
}
}
print SUMMARY "</table></center>\n";
}

# The MAX number defined here can be changed. The MAX number defines
# the column headers. Column 0 contains all the amino acids.
sub initilizePeptideStatisticsArray{
# there are 20 amino acids
$amino_acid_peptide[0][0] ="A";
$amino_acid_peptide[1][0] ="C";
$amino_acid_peptide[2][0] ="D";
$amino_acid_peptide[3][0] ="E";
$amino_acid_peptide[4][0] ="F";
$amino_acid_peptide[5][0] ="G";
$amino_acid_peptide[6][0] ="H";
$amino_acid_peptide[7][0] ="I";
$amino_acid_peptide[8][0] ="K";
$amino_acid_peptide[9][0] ="L";
$amino_acid_peptide[10][0] ="M";
$amino_acid_peptide[11][0] ="N";
$amino_acid_peptide[12][0] ="P";
$amino_acid_peptide[13][0] ="Q";
$amino_acid_peptide[14][0] ="R";
$amino_acid_peptide[15][0] ="S";
$amino_acid_peptide[16][0] ="T";
$amino_acid_peptide[17][0] ="V";
$amino_acid_peptide[18][0] ="W";
$amino_acid_peptide[19][0] ="Y";
for my $q (0 .. 19){
for my $z (1 .. MAX){
$amino_acid_peptide[$q][$z]="0=F";
}
}

}

# After each protein the elements in amino_acid_peptide multi- dimensional
# array gets re-initilized for the next count.
sub initializeValues{
for my $v (0 .. $#amino_acid_peptide){
for my $w (1 .. $#{$amino_acid_peptide[$v]}){
my ($val,$bool) =split(/=/,$amino_acid_peptide[$v][$w]);
$amino_acid_peptide[$v][$w] = "$val"."=F";
}
}
}

# this is the entry point to the program
# reads the file until the eof is satisfied
sub main(){

# if user wants the simple file
if($opt_d==1){
# writes the protein name
print SIMPLE "$opt_i digested with $opt_c\n";
print SIMPLE "Options: -i $opt_i -d $opt_d -a $opt_a -c $opt_c -s $opt_s -S $opt_S $opt_T $opt_Y $opt_R $opt_W $opt_I $opt_M $opt_C $opt_N $opt_G $opt_L -H $opt_H $opt_h \n";
print SIMPLE "\n";
print SIMPLE "\n";
}

if($opt_a==1){
print ANNOTATED "$opt_i digested with $opt_c\n";
print ANNOTATED "Options: -i $opt_i -d $opt_d -a $opt_a -c $opt_c -s $opt_s -S $opt_S $opt_T $opt_Y $opt_R $opt_W $opt_I $opt_M $opt_C $opt_N $opt_G $opt_L -H $opt_H $opt_h \n";
print ANNOTATED "\n";
print ANNOTATED "\n";
}

while (<INPUT_FASTA>) {
# gets rid of carriage return
$_ =~ s/\r//g;
# gets rid of new line character
chomp ($_);
# discards the first line which does not contain any amino acids
if($_ =~ /^>/){
process();
$protein_name = $_;
$protein_name =~ s/^\>//g;
undef $protein;
undef %peptide_hash;
;
}
else{
# reads each line containing amino acids of one protein and joins them to create the entire protein
# stops when the new protein is reached
$protein = $protein.$_;
}
}
process();

if($opt_s==1){
writeSummary();
}

} # end main