/*                                                                                                 */
/* msfilter - calculate all possible masses under given constraints and use this as filter for     */
/* for removing noise from highly accurate, singly charge FTICRMS data                             */
/*                                                                                                 */
/* (c) Magnus Palmblad and Bo Blanckenburg, LUMC 2009-                                             */
/*                                                                                                 */
/* Thanks to Dr. Alex Henneman for debugging the code and compilation under Linux                  */
/*                                                                                                 */
/* This program is free software; you can redistribute it and/or modify it under the terms of the  */
/* Creative Commons Attribution-Share Alike 3.0 License                                            */
/* (http://creativecommons.org/licenses/by-sa/3.0/)                                                */
/*                                                                                                 */
/* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;       */
/* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.       */
/*                                                                                                 */
/* Contact information: n.m.palmblad@lumc.nl/magnus.palmblad@gmail.com                             */
/*                                                                                                 */
/*                                                                                                 */
/* compile with gcc -g -o msfilter msfilter.c -std=c99 -lm                                         */
/*                                                                                                 */
/*                                                                                                 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#define me 0.00054857990943
#define N_ELEMENTS 8
#define SF 100000 /* scaling between vector indices and m/z (or mass, since z=1) */

int main(int argc, char *argv[]) 
{
  /* monoisotopic masses of H, C, N, O, Na, P, S and K - extend to other elements as necessary: */
  double am[N_ELEMENTS]={1.0078250321,12,14.0030740052,15.9949146221,22.9897697,30.9737615,31.97207069,38.9637069}; 
  double mm, lower_mass, upper_mass, mme, step;
  int lc[N_ELEMENTS], lc_max[N_ELEMENTS], nm, DBE;  /* nm = nominal mass (for calculating N-rule etc.), DBE = Double Bond Equivalent */
  long i, j, filter_counter, last_j, output_counter, spectrum_size;
  char *V; /* frequency vector for exact molecular masses */
  double *xy_mz, *F_mz, *output_mz;
  long *xy_intensity, *output_intensity;
  char *p, line[512], input_filename[512], output_filename[512], filter_filename[512], WRITE_FILTER=0, MINIMUM_FILTER=0;
  FILE *inp, *outp;
  

  /* parsing command line parameters */
  
  if (argc<4 || argc>14)
    {
      printf("Usage: msfilter -l <lower mass> -u <upper mass> -i <input file> -o <output file> -e <mass measurement error in ppm> [-w <filter output file> -M (applies minimum, lossy, filter)]\n");
      return 0;
    }

  for(i=1;i<argc;i++) {
    if( (argv[i][0]=='-') && (argv[i][1]=='l') ) lower_mass=atof(&argv[strlen(argv[i])>2?i:i+1][strlen(argv[i])>2?2:0]);
    if( (argv[i][0]=='-') && (argv[i][1]=='u') ) upper_mass=atof(&argv[strlen(argv[i])>2?i:i+1][strlen(argv[i])>2?2:0]);
    if( (argv[i][0]=='-') && (argv[i][1]=='i') ) strcpy(input_filename,&argv[strlen(argv[i])>2?i:i+1][strlen(argv[i])>2?2:0]);
    if( (argv[i][0]=='-') && (argv[i][1]=='o') ) strcpy(output_filename,&argv[strlen(argv[i])>2?i:i+1][strlen(argv[i])>2?2:0]);
    if( (argv[i][0]=='-') && (argv[i][1]=='e') ) mme=1E-6*atof(&argv[strlen(argv[i])>2?i:i+1][strlen(argv[i])>2?2:0]);
    if( (argv[i][0]=='-') && (argv[i][1]=='w') ) {strcpy(filter_filename,&argv[strlen(argv[i])>2?i:i+1][strlen(argv[i])>2?2:0]); WRITE_FILTER=1;}
    if( (argv[i][0]=='-') && (argv[i][1]=='M') ) MINIMUM_FILTER=1;
  }


  /* allocating memory */

  printf("allocating memory..."); fflush(stdout);
  V=malloc(sizeof(char)*(long)ceil(upper_mass)*SF);
  F_mz=malloc(sizeof(double)*(long)ceil(upper_mass)*SF); /* this is larger than it needs to be */
  xy_mz=malloc(sizeof(double)*9E6); /* this is larger than it needs to be */
  xy_intensity=malloc(sizeof(long)*9E6); /* this is larger than it needs to be */
  output_mz=malloc(sizeof(double)*9E6); /* this is larger than it needs to be */
  output_intensity=malloc(sizeof(long)*9E6); /* this is larger than it needs to be */
  for(i=lower_mass*SF;i<upper_mass*SF;i++) V[i]=0;
  printf("OK (allocated %i bytes)\n",sizeof(char)*(long)ceil(upper_mass)*SF);


  /* generate filter */

  printf("generating filter..."); fflush(stdout);


  if (!MINIMUM_FILTER)
    {
      for(i=0;i<N_ELEMENTS;i++)
	{
	  lc_max[i]=floor(upper_mass/am[i]+0.5); 
	  // if(i<N_ELEMENTS-1) printf("lc[%i]=%i, ",i,lc_max[i]); else printf("lc[%i]=%i)...",i,lc_max[i]);
	}
      fflush(stdout);
      
      
      /* add constraints on elemental compositon */
      
      lc_max[4]=1; /* max 1 Na */
      lc_max[5]=1; /* max 1 P */
      lc_max[6]=2; /* max 2 S */
      lc_max[7]=1; /* max 1 K */
      
      
      /* loop through all possible elemental compositions and calculate molecular masses */
      
      for(lc[0]=0;lc[0]<lc_max[0];lc[0]++) 
	{
	  for(lc[1]=0;lc[1]<lc_max[1];lc[1]++) 
	    {
	      mm=lc[0]*am[0]+lc[1]*am[1]; if(mm>upper_mass) continue;
	      for(lc[2]=0;lc[2]<lc_max[2];lc[2]++) 
		{
		  mm=lc[0]*am[0]+lc[1]*am[1]+lc[2]*am[2]; if(mm>upper_mass) continue;
		  for(lc[3]=0;lc[3]<lc_max[3];lc[3]++) 
		    {
		      mm=lc[0]*am[0]+lc[1]*am[1]+lc[2]*am[2]+lc[3]*am[3]; if(mm>upper_mass) continue;
		      for(lc[4]=0;lc[4]<lc_max[4];lc[4]++) 
			{
			  mm=lc[0]*am[0]+lc[1]*am[1]+lc[2]*am[2]+lc[3]*am[3]+lc[4]*am[4];
			  nm=lc[0]+lc[1]*12+lc[2]*14+lc[3]*16+lc[4]*23+lc[5]*31+lc[6]*32+lc[3]*39; /* nominal masses for H, C, N, O and S */
			  
			  if(mm<upper_mass) 
			    {
			      if((!(lc[5]%2))&((nm%2)^(lc[2]%2)) || (lc[5]%2)&((!(nm%2))^(lc[2]%2))) /* inverted nitrogen rule for +1 charged species with H+ (1), Na+ (23) or K+ (39) */
				{
				  DBE=((2*lc[1]+2)-lc[0])/2; /* Double Bond Equivalent (saturation) */
				  if((DBE>-10)&&(DBE<(0.06*mm+10))) /* see http://fiehnlab.ucdavis.edu/projects/Seven_Golden_Rules/Ring-Double-Bonds/ */
				    {
				      mm-=me; /* subtract electron mass for +1 charge */
				      V[(long)floor(SF*mm+0.5)]++;
				    }
				}
			    }
			}
		    }
		}
	    }
	}

      printf("OK (generated lossless filter)\n"); fflush(stdout);
    }

  if (MINIMUM_FILTER) /* "minimum" dense filter based on experimental observation of mass defects */
    {
      step=0.00001; /* or 1.0/SF */
      for(mm=lower_mass;mm<=upper_mass;mm+=step)
	{
	  if( ((mm-floor(mm))>0.00025*mm-0.10) && ((mm-floor(mm))<0.0005*mm+0.12) ) V[(long)floor(SF*mm+0.5)]=1;
	}

      printf("OK (generated minimum, lossy, filter)\n"); fflush(stdout);
    }


  /* read input from *.xy file */
  
  if ((inp=fopen(input_filename, "r"))==NULL) {printf("error opening input file %s\n",input_filename);return -1;}
  printf("reading input from %s...",input_filename);fflush(stdout);
  
  i=0;
  while (fgets(line, 100, inp) != NULL)
    {
      if (strcmp(line,"\n")==0) continue;
      p=strtok(line," ");   
      if ( (atof(p)>lower_mass) && (atof(p)<upper_mass) )
	{
	  xy_mz[i]=(double)atof(p);
	  p=strtok('\0'," "); 
	  xy_intensity[i]=(long)atoi(p); 
	  i++;
	}
    }
  
  spectrum_size=i;
  fclose(inp);
  printf("OK (%i datapoints read)\n",spectrum_size); fflush(stdout);


  /* make filter vector (only non-zero elements) */
  
  printf("making filter vector..."); fflush(stdout);

  j=0;
  for(i=(long)floor(lower_mass*SF);i<(long)ceil(upper_mass*SF);i++) 
    {
      if(V[i]) {F_mz[j]=(double)i/(double)SF; j++;} /* keep only non-zero elements */
    }
  filter_counter=j;
  
  printf("OK (filter contains %i elements)\n",filter_counter); fflush(stdout);


  /* filter data */

  printf("filtering data (mme=%1.6f)",mme); fflush(stdout);
  
  j=0; output_counter=0; last_j=0;
  for(i=0;i<spectrum_size;i++) 
    {
      if (!(i%100000)) printf("."); fflush(stdout);
      for(j=last_j;j<filter_counter;j++)
	{
	  if (F_mz[j]>(xy_mz[i]*(1.0+mme))) break; /* we have moved too far already */
	  if ((fabs(F_mz[j]-xy_mz[i])/xy_mz[i])<mme) /* are we within mme from a possible 1+ m/z? */
	    {
	      output_mz[output_counter]=xy_mz[i];
	      output_intensity[output_counter]=xy_intensity[i];
	      output_counter++;
	      last_j=j;
	      break;
	    }
	}
    }
  printf("OK\n"); fflush(stdout);


  /* write filtered output to file */
  
  if ((outp=fopen(output_filename, "w+"))==NULL) {printf("error opening output file %s\n",output_filename);return -1;}
  printf("writing output to %s...",output_filename); fflush(stdout);
  for(i=0;i<output_counter;i++)
    {
      fprintf(outp,"%5.6f %i\n",(double)output_mz[i],output_intensity[i]); /* write filtered spectrum */
    }

  fclose(outp);
  printf("OK\n"); fflush(stdout);


  if (WRITE_FILTER)
    {
      /* write frequency vector (filter) to file */
      
      if ((outp=fopen(filter_filename, "w"))==NULL) {printf("error opening filter output file %s\n",filter_filename);return -1;}
      printf("writing filter to %s...",filter_filename);fflush(stdout);
      
      for(i=lower_mass*SF;i<upper_mass*SF;i++) 
	{
	  if(V[i]) fprintf(outp,"%5.6f %i\n",(double)i/SF,V[i]); /* write only non-zero elements */
	}
      
      fclose(outp);
      printf("OK\n"); fflush(stdout);
    }


  /* free memory */
  
  free(V);
  free(F_mz);
  free(xy_mz);
  free(xy_intensity);
  free(output_mz);
  free(output_intensity);
  
  return 0;
}
