/*                                                                                                 */
/* dig - in silico digestion of proteins                                                           */
/*                                                                                                 */
/* (c) Magnus Palmblad, Division of Ion Physics, Uppsala University, 2000.                         */ 
/*                                                                                                 */
/* Usage: dig sequence_list cleavage_rules modifications                                           */
/*                                                                                                 */
/* compile with gcc -o dig dig.c -lm                                                               */
/*                                                                                                 */ 
/*                                                                                                 */

#include <stdio.h>
#include <stdlib.h>  
#include <ctype.h>
#include <string.h>
#include <math.h>

#define MAX_ELEMENTS 5 
#define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
#define MAX_CLEAVE 41
#define HPLUS_MASS 1.00727646688
#define MAX_FRAGMENT_LENGTH 1000


int main(int argc, char *argv[]) 
{
  char line[250];      
  char infile[85], outfile[85]; /* for reading input and writing output */
  char slask[85];    
  char sequence[8000]; /* for protein sequence */
  char cleave_at[MAX_CLEAVE]; /* aa of cleavage (only one aa long) */ 
  char cleave_side[MAX_CLEAVE]; /* N or C */
  char cleave_name[MAX_CLEAVE][15];
  char dont_cleave_at[MAX_CLEAVE]; /* aa of cleavage (only one aa long) */ 
  char dont_cleave_side[MAX_CLEAVE]; /* N or C */
  char dont_cleave_name[MAX_CLEAVE][15];
  char modification_at[20]; /* i.e. carbamidomethylations of cysteins */
  int position_modification_at[100]; /* i.e. position-specific modifications */
  char modification_name[20][40];
  char position_modification_name[100][40];
  char modification_always[20][4]; /* yes or no */
  char position_modification_always[100][4]; /* yes or no */
  int  modification[20][MAX_ELEMENTS]; /* encoding of modification */
  int  position_modification[100][MAX_ELEMENTS]; /* encoding of modification */
  char sequence_file[25000][60]; /* for reading in sequence list */
  char sequence_line[70][80]; /* for reading FASTA file */
  struct fragment_type {
    int begin;
    int end;
    char sequence[MAX_FRAGMENT_LENGTH];
    char modifications[200];
    int fragment[5]; /* for elemental composition of fragment */
    float mass;
  } fragment_list[20000], temp_list[1024]; /* for generated fragments */
  
  

  typedef struct
  {
    int index;
    float mass;
  } sort_type;
  sort_type sort_list[32760];

  int cleavage_site[500];
  char output[95];

  char *p,delimiter[5];

  FILE *inp, *outp;

  unsigned char aa[20][5]={{5,3,1,1,0}, /* amino acid compositions */
			   {12,6,4,1,0},
			   {6,4,2,2,0},
			   {5,4,1,3,0},
			   {5,3,1,1,1},
			   {7,5,1,3,0},
			   {8,5,2,2,0},
			   {3,2,1,1,0},
			   {7,6,3,1,0},
			   {11,6,1,1,0},
			   {11,6,1,1,0},
			   {12,6,2,1,0},
			   {9,5,1,1,1},
			   {9,9,1,1,0},
			   {7,5,1,1,0},
			   {5,3,1,2,0},
			   {7,4,1,2,0},
			   {10,11,2,1,0},
			   {9,9,1,2,0},
			   {9,5,1,1,0}};
  
  double mimass[5]={1.0078250321,12,14.0030740052,15.9949146221,31.97207069}; /* ok, maybe a few more decimals than needed... */
  double mass;
  int a,b,n2,charge, n_cleave, n_dont_cleave, n_sequences, n_sites, sequence_counter, temp_counter;
  int i,j,k,l,m,seqlen, n_modifications,n_position_modifications, n_fragments, max_missed, min_length, index, temp, X;
  int fragment[5]; /* for elemental composition of fragment */
  long n;
   
  int partition(sort_type A[], int first, int last) 
    { 
      int pivindx, top, i; sort_type pivot;  /* Choose a pivot: select a random index between first and last.*/
      i = rand() % (last - first + 1) + first;  /* Put the pivot first, remember pivot, initialise ready for loop.*/
      pivot.index = A[i].index; /* remember the pivot */
      pivot.mass = A[i].mass; /* remember the pivot */
      A[i].index = A[first].index; 
      A[i].mass = A[first].mass;
      A[first].index = pivot.index; /* pivot now first */
      A[first].mass = pivot.mass; /* pivot now first */

      pivindx = first; top = last;   while (top > pivindx) { /* Still unknown elements */
	/* top indicates the highest unknown element */
	if (A[top].mass >= pivot.mass) { top--; /* where it belongs, count as >=*/
	} else { 
	  A[pivindx].index = A[top].index; /* shift down */
	  A[pivindx].mass = A[top].mass; /* shift down */
	  A[top].index = A[pivindx + 1].index;/*shift displaced element up*/
	  A[top].mass = A[pivindx + 1].mass;/*shift displaced element up*/
	A[pivindx + 1].index = pivot.index; /* Put pivot back */
	A[pivindx + 1].mass = pivot.mass; /* Put pivot back */
	pivindx++; /* Alter record of pivot location*/
	} 
      } 
      return pivindx; 
    }

  void quicksort(sort_type A[], int first, int last) 
    { 
      int pivindx; /* index of the element separating the two sub-arrays*/
      if (last > first) { /* More than one element to be sorted?*/
	pivindx = partition(A, first, last); 
	quicksort(A, first, pivindx - 1); 
	quicksort(A, pivindx + 1, last); 
      } 
    }  


  /* parsing command line parameters */
  
  if ((argc!=4))
    {
      printf("Usage: dig sequence_list cleavage_rules modifications\n");
      return 0;
    }
  
  printf("reading cleavage rules...\n");
  
  /* reading cleavage rules file */
  
  strcpy(infile,argv[2]);
  if ((inp = fopen(infile, "r"))==NULL) {printf("error opening cleavage rules file\n");return -1;}
  n=0; 
  fgets(line, 250, inp); /* read past first two lines */
  fgets(line, 250, inp);
  while (fgets(line, 250, inp) != NULL)
    {
      if (strcmp(line,"\n")==0) break; 
      p=strtok(line," ");   
      cleave_at[n]=p[0];
      p=strtok('\0'," ");
      cleave_side[n]=p[0];
      p=strtok('\0'," ");
      strcpy(cleave_name[n],p);
      //cleave_name[n][strlen(p)-1]='\0'; /* PC text files */
         cleave_name[n][strlen(p)]='\0'; /* PC text files */
      n++;
    }
  n_cleave=n;
 
  fgets(line, 250, inp); /* read past next lines */
  fgets(line, 250, inp);
  p=strtok(line," ");
  max_missed=atoi(p);
  p=strtok('\0'," ");
  min_length=atoi(p);
  fgets(line, 250, inp);
  fgets(line, 250, inp);
  fgets(line, 250, inp);
  n=0;
  while (fgets(line, 250, inp) != NULL)
    {
      if (strcmp(line,"\n")==0) break; 
      p=strtok(line," ");   
      dont_cleave_at[n]=p[0];
      p=strtok('\0'," ");
      dont_cleave_side[n]=p[0];
      p=strtok('\0'," ");
      strcpy(dont_cleave_name[n],p);
      dont_cleave_name[n][strlen(p)]='\0'; /* PC text files */
      n++;
    }
  n_dont_cleave=n;
  close(inp);

  printf("cleavage rules:\n");
  for(i=0;i<n_cleave;i++) printf("cleave on %c-terminal side of %c\n",cleave_side[i],cleave_at[i]);
  printf("maximum %i cleavage sites missed\n",max_missed);
  printf("minimum %i aa fragment length\n",min_length);
  for(i=0;i<n_dont_cleave;i++) printf("don't cleave at %c\n",dont_cleave_at[i]);

  printf("reading sequence list...\n"); 
 
  /* reading sequence list */
  
  strcpy(infile,argv[1]);
  if ((inp = fopen(infile, "r"))==NULL) {printf("error sequence list file\n");return -1;}
  n=0;  
  while (fgets(line, 250, inp) != NULL)
    {
      if (strcmp(line,"\n")==0) continue;
      p=strtok(line," \n");   
      strcpy(sequence_file[n],p);
      n++;
    }
  n_sequences=n;
  close(inp);

  printf("reading modifications...\n");
  
  /* reading modifications file */
  
  strcpy(infile,argv[3]);
  if ((inp = fopen(infile, "r"))==NULL) {printf("error opening modifications file\n");return -1;}
  n=0;n2=0;
  fgets(line, 250, inp); /* read past first line */
  while (fgets(line, 250, inp) != NULL)
    {
      if (strcmp(line,"\n")==0) break; 
      p=strtok(line," ");
      if(atoi(p)!=0) {
	position_modification_at[n2]=atoi(p);      
	p=strtok('\0'," "); 
	strcpy(position_modification_name[n2],p);
	p=strtok('\0'," ");
	strcpy(position_modification_always[n2],p);
	for(i=0;i<MAX_ELEMENTS;i++)
	  {
	  p=strtok('\0'," ");
	  position_modification[n2][i]=atoi(p);
	}
      n2++;
      }
      else {
	modification_at[n]=p[0];
	p=strtok('\0'," "); 
	strcpy(modification_name[n],p);
	p=strtok('\0'," ");
	strcpy(modification_always[n],p);
	for(i=0;i<MAX_ELEMENTS;i++)
	  {
	    p=strtok('\0'," ");
	  modification[n][i]=atoi(p);
	  }
	n++;
      }
    }
  n_modifications=n; n_position_modifications=n2;
      close(inp);
       
  for(sequence_counter=0;sequence_counter<n_sequences;sequence_counter++)
    {
      printf("reading sequence file %s\n",sequence_file[sequence_counter]);

      /* reading sequence file */
      
      strcpy(delimiter," \n");
     

      strcpy(infile,sequence_file[sequence_counter]);
      if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence file\n");return -1;}
      n=0;
      fgets(line, 250, inp); /* read FASTA file header */
      while (fgets(line, 250, inp) != NULL)
	{
	  if (strcmp(line,"\n")==0) continue;
	  p=strtok(line,delimiter);
	  strncpy(sequence_line[n],p,70);
	  n++;
	}
      close(inp);
      
      strcpy(sequence,sequence_line[0]);
      for(i=1;i<n;i++) strcat(sequence,sequence_line[i]);
      //sequence[strlen(sequence)]='\0'; /* to work with PC text files - remove last ^M */
      //sequence[strlen(sequence)-2]='\0'; /* to work with PC text files - remove last ^M ??? */
      printf("read sequence %s\n",sequence);
      
     
      n=1;index=0;
      cleavage_site[0]=0; /* always N-term end */
      
      /* find and mark cleavage sites according to the cleavage rules */

      for(i=index;i<strlen(sequence);i++)
	{
	  temp=1;
	  for(k=0;k<n_cleave;k++) temp*=sequence[i]-cleave_at[k]; /* = 0 iff residue is in cleave_at */	  
	  if(temp==0) {
	    temp=1;
	    for(k=0;k<n_dont_cleave;k++) temp*=sequence[i+1-2*(dont_cleave_side[k]=='C')]-dont_cleave_at[k]; /* do we have a dont_cleave_at? */	
	    if(temp!=0) {cleavage_site[n]=i;index++;n++;} /* no, we don't, and we have cleavage site here */ 
	  }
	}
      cleavage_site[n]=strlen(sequence)-1; /* always C-term end - ^M in string (PC)*/
      n++;
      n_sites=n;
      
    
      //for(i=0;i<n_sites;i++) printf("%i ",cleavage_site[i]);
      //printf("\n");
      n=0;
      for(index=0;index<n_sites-1;index++)
	{
	  for(i=1;i<=max_missed+1;i++)
	    {
	      if(index+i<n_sites) {
		seqlen=cleavage_site[index+i]-cleavage_site[index];
		if(seqlen>=min_length) 
		  {
		    for(k=0;k<n_cleave;k++) if(cleave_at[k]==sequence[cleavage_site[index]]) break;
		    if(cleave_side[k]=='C') fragment_list[n].begin=cleavage_site[index]+1; /* C-terminal cleavage */
		    else fragment_list[n].begin=cleavage_site[index]; /* N-terminal cleavage */
		    
		    for(k=0;k<n_cleave;k++) if(cleave_at[k]==sequence[cleavage_site[index+i]]) break;
		    if(cleave_side[k]=='C') fragment_list[n].end=cleavage_site[index+i]; /* C-terminal cleavage */
		    else fragment_list[n].end=cleavage_site[index+i]-1; /* N-terminal cleavage */

		    if(index==0)  /* N-terminal fragment (0) */
		      {
			fragment_list[n].begin=0;
			fragment_list[n].end=cleavage_site[index+i]; 
		      }
		    if(index+i==n_sites-1) fragment_list[n].end=cleavage_site[index+i]; /* C-terminal fragment ? */

		    seqlen=fragment_list[n].end-fragment_list[n].begin+1;
		    
		    for(j=0;j<MAX_FRAGMENT_LENGTH;j++) fragment_list[n].sequence[j]='\0'; /* to fix a bug - only first file works!? */
		    strncpy(fragment_list[n].sequence,&sequence[fragment_list[n].begin],seqlen);
		    
		    if(index==0) strncpy(fragment_list[n].sequence,&sequence[fragment_list[n].begin],seqlen);
		    if(index+i==n_sites-1) strncpy(fragment_list[n].sequence,&sequence[fragment_list[n].begin],seqlen-1);
		    
		    
		    
		    /* GENERATE MOLECULAR FORMULAE FOR ALL POSSIBLE MODIFICATIONS OF THE FRAGMENT */
		    
		    temp_list[0].mass=0.0;
		    temp_list[0].begin=fragment_list[n].begin;
		    temp_list[0].end=fragment_list[n].end;
		    strcpy(temp_list[0].sequence,fragment_list[n].sequence);
		    //strcpy(temp_list[0].modifications,fragment_list[n].modifications); wrong?
		    strcpy(temp_list[0].modifications,"\0");
		    for(b=0;b<MAX_ELEMENTS;b++) temp_list[0].fragment[b]=0; 
		    temp_counter=1;X=0;
		    
		    for(j=0;j<seqlen;j++) /* generate molecular formula for fragment */
		      {
			if((fragment_list[n].sequence[j]=='X') || (fragment_list[n].sequence[j]=='Z') || (fragment_list[n].sequence[j]=='B')) {X=1;break;}
			
			if(strchr(AMINO_ACIDS,fragment_list[n].sequence[j])==NULL) {X=1;break;}
			
			a=20-strlen(strchr(AMINO_ACIDS,fragment_list[n].sequence[j])); /* current amino acid */
			
			for(b=0;b<MAX_ELEMENTS;b++) for(m=0;m<temp_counter;m++) temp_list[m].fragment[b]+=aa[a][b]; /* add aa to all in temp_list */

			for(l=0;l<n_modifications;l++) /* is current residue among residue type modifications? */
			  {
			    if(a==20-strlen(strchr(AMINO_ACIDS,modification_at[l])))
			      {
				if(strcmp(modification_always[l],"no")==0) {
				  /* double temp_list and add/subtract modification to first half of new list */
				  for(m=0;m<temp_counter;m++) {
				    temp_list[temp_counter+m].begin=temp_list[m].begin;
				    temp_list[temp_counter+m].end=temp_list[m].end;
				    strcpy(temp_list[temp_counter+m].sequence,temp_list[m].sequence);
				    strcpy(temp_list[temp_counter+m].modifications,temp_list[m].modifications);
				    for(temp=0;temp<MAX_ELEMENTS;temp++) temp_list[temp_counter+m].fragment[temp]=temp_list[m].fragment[temp];
				  }

				  for(m=0;m<temp_counter;m++) {
				    for(b=0;b<MAX_ELEMENTS;b++) temp_list[m].fragment[b]+=modification[l][b]; /* add/subtract modification */
				    sprintf(temp_list[m].modifications,"%s%s@%i ",temp_list[m].modifications,modification_name[l],temp_list[m].begin+j+1);
				  }
				  temp_counter*=2;
				}
				else { 
				  /* add/subtract modification to whole list */ 
				  for(m=0;m<temp_counter;m++) {
				    if(a==20-strlen(strchr(AMINO_ACIDS,modification_at[l])))
				      {
					for(b=0;b<MAX_ELEMENTS;b++) temp_list[m].fragment[b]+=modification[l][b]; /* add/subtract modification */
					sprintf(temp_list[m].modifications,"%s%s@%i ",temp_list[m].modifications,modification_name[l],temp_list[m].begin+j+1);
				      }
				  }
				}
			      } 
			  }
		      	
			for(l=0;l<n_position_modifications;l++) /* positional modifications? */
			  {
			    if(j+1==position_modification_at[l])
			      {
				if(strcmp(position_modification_always[l],"no")==0) { printf("no!!! %i\n",temp_counter);
				/* double temp_list and add/subtract pos. modification to first half of new list */
				for(m=0;m<temp_counter;m++) {
				  temp_list[temp_counter+m].begin=temp_list[m].begin;
				  temp_list[temp_counter+m].end=temp_list[m].end;
				  strcpy(temp_list[temp_counter+m].sequence,temp_list[m].sequence);
				  strcpy(temp_list[temp_counter+m].modifications,temp_list[m].modifications);
				  for(temp=0;temp<MAX_ELEMENTS;temp++) temp_list[temp_counter+m].fragment[temp]=temp_list[m].fragment[temp];
				}				  
				
				for(m=0;m<temp_counter;m++)
				  {
				    for(b=0;b<MAX_ELEMENTS;b++) temp_list[m].fragment[b]+=position_modification[l][b]; /* add/subtract modification */
				    sprintf(temp_list[m].modifications,"%s%s@%i ",temp_list[m].modifications,position_modification_name[l],position_modification_at[l]);
				  }
				temp_counter*=2;
				}
				else {
				  /* add/subtract pos. modification to whole list */ 
				  for(m=0;m<temp_counter;m++)
				    {
				      for(b=0;b<MAX_ELEMENTS;b++) temp_list[m].fragment[b]+=position_modification[l][b]; /* add/subtract modification */
				      sprintf(temp_list[m].modifications,"%s%s@%i ",temp_list[m].modifications,position_modification_name[l],position_modification_at[l]);
				    }
				}
			      }
			  }
		      }
		    
		    /* ONCE FOR EACH FRAGMENT (I.E. TEMP_LIST): */
		    
		    if(X==0) 
		      {
			//printf("temp_counter = %i\n",temp_counter);
			for(m=0;m<temp_counter;m++)
			  {
			    temp_list[m].fragment[0]+=2; temp_list[m].fragment[3]++; /* add H2O (always) */
			    
				/* calculate monoisotopic mass w/o H+ based on molecular formula */ 
			    
			    temp_list[m].mass=0.0;
			    for(b=0;b<MAX_ELEMENTS;b++) temp_list[m].mass+=temp_list[m].fragment[b]*mimass[b];		  
			  }
			
			for(m=0;m<temp_counter;m++) {
			  fragment_list[n+m].begin=temp_list[m].begin;
			  fragment_list[n+m].end=temp_list[m].end;
			  strcpy(fragment_list[n+m].sequence,temp_list[m].sequence);
			  strcpy(fragment_list[n+m].modifications,temp_list[m].modifications);
			  for(temp=0;temp<MAX_ELEMENTS;temp++) fragment_list[n+m].fragment[temp]=temp_list[m].fragment[temp];
			  fragment_list[n+m].mass=temp_list[m].mass;
			}			
			
			n+=temp_counter;
		      }
		  }
	      }
	    }
	}
      
    

      n_fragments=n;
      printf("generated %i fragments\n",n);



      /* OUTPUT GENERATED LIST */

      /* open .dig file for output (monoisotopic neutral mass) */
      p=strtok(sequence_file[sequence_counter],". \n");
      strcpy(outfile,p);
      strcat(outfile,".dig"); /* 5.FASTA -> 5.dig */ printf("opening file %s for output\n",outfile);
      if ((outp = fopen(outfile, "w"))==NULL) {printf("error opening output file\n");return -1;}
      for(i=0;i<n_fragments;i++)
	fprintf(outp,"%i %i %s %f %s\n",fragment_list[i].begin+1,fragment_list[i].end+1,fragment_list[i].sequence,fragment_list[i].mass,fragment_list[i].modifications);
      close(outp);

     

      for(i=0;i<n_fragments;i++)
	{
	  sort_list[i].index=i;
	  sort_list[i].mass=fragment_list[i].mass;
	}

      quicksort(sort_list,0,n_fragments-1); /* sort list according to mass */


      /* open .mih file for output (sorted monoisotopic singly protonated mass) */
      
      strcpy(outfile,p);
      strcat(outfile,".mih"); /* 5.FASTA -> 5.mih */ printf("opening file %s for output\n",outfile);
      if ((outp = fopen(outfile, "w"))==NULL) {printf("error opening output file\n");return -1;}
      for(i=0;i<n_fragments;i++)
      	fprintf(outp,"%i %i %f\n",fragment_list[sort_list[i].index].begin+1,fragment_list[sort_list[i].index].end+1,fragment_list[sort_list[i].index].mass+HPLUS_MASS);
      close(outp);
    }  
      
      /* cleanup */
      return 0;
}
