/* GNU polyxmass - the massist's program.
   -------------------------------------- 
   Copyright (C) 2000,2001,2002,2003,2004 Filippo Rusconi

   http://www.polyxmass.org

   This file is part of the "GNU polyxmass" project.
   
   The "GNU polyxmass" project is an official GNU project package (see
   www.gnu.org) released ---in its entirety--- under the GNU General
   Public License and was started at the Centre National de la
   Recherche Scientifique (FRANCE), that granted me the formal
   authorization to publish it under this Free Software License.

   This software is free software; you can redistribute it and/or
   modify it under the terms of the GNU  General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.
   
   This software is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.
   
   You should have received a copy of the GNU  General Public
   License along with this software; if not, write to the
   Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
   Boston, MA 02110-1301, USA.
*/

#include "pxmchem-cleave.h"
#include "libpolyxmass-plugin.h"
#include "pxmchem-formula.h"
#include "pxmchem-polchemdef.h"
#include "pxmchem-masscalc.h"
#include "pxmchem-monomer.h"



/* File-scope global variables needed for the cleavage (partial
   cleavage stuff specifically) routines.

   These variables have external linkage and are written by
   pxmchem_cleave_fill_oligomer_GPA () while it builds oligomers.
*/

/* Counter of the partial cleavage round currently being processed; it
   is used as the loop variable in pxmchem_cleave_fill_oligomer_GPA ().
*/
gint partial = 0;

/* Indexes of the left-end and right-end monomers of the oligomer
   currently being built in pxmchem_cleave_fill_oligomer_GPA ().
*/
gint le_idx = 0;
gint re_idx = 0;

/* NOTE(review): 'current' is not referenced in the part of this file
   that was reviewed; confirm that it is still needed.
*/
gint current = 0;





/* Cleaves 'polymer' according to the cleavage specification found in
   'cleaveopt' and stores the result in 'GPA'.

   polymer:    the polymer to cleave; its monomerGPA must be non-NULL.
   GPA:        array that receives one GPtrArray of oligomers for each
               partial cleavage round asked for in 'cleaveopt'.
   polchemdef: polymer chemistry definition (provides the monomer code
               length and the delimited list of known monomer codes).
   cleaveopt:  cleavage options (cleavespec and number of partial
               cleavages).
   calcopt:    passed on unchanged to the oligomer construction.
   ionizerule: must be non-NULL; can be set by the caller to the
               polchemdef->ionizerule.

   Returns the number of oligomers generated (0 if the polymer
   sequence is empty), or -1 if an error occurred.
 */
gint
pxmchem_cleave_polymer (PxmPolymer *polymer,
			GPtrArray *GPA,
			PxmPolchemdef *polchemdef,
			PxmCleaveOpt *cleaveopt,
			PxmCalcOpt *calcopt,
			PxmIonizerule *ionizerule)
{
  gint codelen = 1;
  gint count = 0;
  gint result = -1;

  /* These two arrays only live for the duration of one call, so they
     are plain automatic variables.
  */
  GArray *cleaveidxGA = NULL;
  GArray *nocleaveidxGA = NULL;


  g_assert (polymer != NULL && polymer->monomerGPA != NULL);

  g_assert (polchemdef != NULL);
  codelen = polchemdef->codelen;

  g_assert (GPA != NULL);
  g_assert (cleaveopt != NULL);
  g_assert (ionizerule != NULL);


  /* The way we cleave a polymer sequence is by having the GPtrArray
     of monomers iterated into so that proper motifs corresponding to
     the specificity of cleavage are detected. If the polymer we do
     get as param has an empty GPtrArray of monomers we just return
     because that means that there is just nothing to cleave.
   */
  if (polymer->monomerGPA->len <= 0)
    return 0;

  /* Parse the cleavespec pattern into cleavemotif instances, which
     are stored in the cleavespec itself. Nothing was allocated by
     this function so far, so we can return immediately upon error.
  */
  if (-1 ==
      pxmchem_cleave_parse_cleavespec_pattern (cleaveopt->cleavespec,
					       polchemdef->delim_codes,
					       codelen))
    {
      g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
	   _("%s@%d: failed to parse cleavespec pattern: '%s'\n"),
	   __FILE__, __LINE__, cleaveopt->cleavespec->pattern);

      return -1;
    }

  /************************ DEBUG ****************************
   */
#if 0
  {
    PxmCleavemotif *clm = NULL;
    gint iter = 0;
    gint jter = 0;

    for (iter = 0; iter < cleaveopt->cleavespec->clmGPA->len; iter++)
      {
	clm = g_ptr_array_index (cleaveopt->cleavespec->clmGPA, iter);
	for (jter = 0;
	     jter < clm->motifGPA->len;
	     jter++)
	  {
	    debug_printf (("motif number: '%d' is: '%s'\n", jter,
			   (gchar *) g_ptr_array_index (clm->motifGPA,
							jter)));
	  }
      }
  }
#endif
  /***********************************************************
   */

  /* At this point we should have a correct array of cleavemotif
     instances, that in turn each contain an array of motifs. We now
     have to iterate in this array of cleavemotif instances and for
     each motif establish where such motif is located in the polymer
     sequence and if it is found to be cleaved (like for "/Lys") or
     not to be cleaved (like "-Lys/Pro").
  */
  cleaveidxGA = g_array_new (TRUE, TRUE, (guint) sizeof (gint));
  nocleaveidxGA = g_array_new (TRUE, TRUE, (guint) sizeof (gint));

  if (-1 == pxmchem_cleave_fill_cleave_nocleave_GA (polymer,
						    cleaveopt->cleavespec,
						    cleaveidxGA,
						    nocleaveidxGA))
    {
      /* NOTE(review): this message text is the same as the one used
	 for the pattern parsing failure above; it is left unchanged
	 so that existing translations keep working.
      */
      g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
	     _("%s@%d: failed to parse cleavespec pattern: '%s'\n"),
	     __FILE__, __LINE__, cleaveopt->cleavespec->pattern);

      goto cleanup;
    }

  /* Positions that were flagged as not-for-cleavage must be removed
     from the array of positions to cleave.
  */
  if (-1 == pxmchem_cleave_remove_nocleave_GA (cleaveidxGA,
						   nocleaveidxGA))
    {
      g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
	   _("%s@%d: failed to remove not-for-cleave positions\n"),
	   __FILE__, __LINE__);

      goto cleanup;
    }

  g_array_sort (cleaveidxGA, pxmchem_cleave_sort_increasing_cleave_GA);

#if 0
  {
    gint iter_debug = 0;

    for (iter_debug = 0; iter_debug < cleaveidxGA->len; iter_debug++)
      {
	printf ("current index is %d\n", g_array_index (cleaveidxGA,
							gint,
							iter_debug));
      }
  }
#endif


  /* We now have an array containing all the positions where cleavage
     should occur. We should now construct oligomers according to
     these positions. Note that GPA, the array passed as parameter is
     an array that will hold NOT oligomers BUT GPtrArrays of
     oligomers. One array of oligomers is going to be allocated for
     each partial cleavage that is asked in cleaveopt. Once this array
     is successfully filled with oligomers, it is going to be added to
     GPA.
  */
  count = pxmchem_cleave_fill_oligomer_GPA (polymer,
					    GPA,
					    cleaveidxGA,
					    polchemdef,
					    cleaveopt,
					    calcopt,
					    ionizerule);
  if (count == -1)
    {
      g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
	     _("%s@%d: failed to fill the array of arrays of oligomers\n"),
	     __FILE__, __LINE__);

      goto cleanup;
    }

  result = count;

 cleanup:
  /* The index arrays were only instrumental in the cleavage
     calculation: free them whatever the outcome.
  */
  g_array_free (cleaveidxGA, TRUE);
  g_array_free (nocleaveidxGA, TRUE);

  return result;
}





gint
pxmchem_cleave_parse_cleavespec_pattern (PxmCleaveSpec *cls,
					 gchar *delim_codes,
					 gint codelen)
{
  /* We get a cleavespec object in which the 'pattern' must be
   * non-NULL.  This pattern member is a ';'-delimited string, in
   * which each sub-string is a 'site'. Each site is in turn
   * constituted by a motif and a '/' that indicates where the motif
   * is actually cleaved. For example the "-Lys/Pro" site is actually
   * a motif of sequence "LysPro" and the site holds two more
   * informations with respect to the mere motif: it says that the
   * motif should not be cleaved ('-') and that if the '-' were not
   * there, the cleavage would occur between the Lys and the Pro ('/'
   * symbolizes the cleavage).
   *
   * For example, if the cleavespec had a "Lys/;Arg/;-Lys/Pro" string,
   * it would be split into 3 strings: "Lys/" and "Arg/" and
   * "-Lys/Pro" (these are 'site' strings). These three site strings
   * would further be deconstructed into motif strings (removal of '-'
   * and '/' characters). One cleavemotif instance is created for each
   * site and appended to cls->clmGPA. Thus, for example, "-Lys/Pro"
   * would yield a cleavemotif of 'motif' LysPro, with a FALSE cleave
   * member and a 1 offset member.
   *
   * 'delim_codes' is the delimited list of valid monomer codes and
   * 'codelen' the maximum length of a monomer code; both are handed
   * to pxmchem_cleave_parse_cleavage_site ().
   *
   * Returns the number of cleavemotif instances held in cls->clmGPA
   * after parsing. Upon error -1 is returned; cleavemotif instances
   * appended to cls->clmGPA before the faulty site was encountered
   * are left in that array.
   */
  PxmCleavemotif *clm = NULL;

  gchar **sites = NULL;

  gchar *first_slash = NULL;
  gchar *last_slash = NULL;
  gchar *dash = NULL;
  gchar *cur = NULL;

  gint iter = 0;
  gint result = -1;

  g_assert (cls != NULL);
  g_assert (cls->clmGPA != NULL);
  g_assert (cls->pattern != NULL);

  g_assert (delim_codes != NULL);


  /* Make sure that the pattern in the cls is at least two characters
   * wide: "K/". A shorter pattern cannot describe any cleavage site,
   * so this is an error.
   */
  if (strlen (cls->pattern) < 2)
    {
      g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
	     _("%s@%d: cleavage pattern is invalid: '%s'\n"),
	     __FILE__, __LINE__, cls->pattern);

      return -1;
    }

  /* There can be more than one site in a pattern, and each different
   * site should be separated from the other by a ';' character. Use
   * this ';' character to split the pattern into site strings.
   *
   * At this point, if we had a pattern "Lys/;Arg/;-Lys/Pro",
   * we would have
   * sites[0]="Lys/"
   * sites[1]="Arg/"
   * sites[2]="-Lys/Pro"
   */
  sites = g_strsplit (cls->pattern, ";", 0 /* no limit on pieces */);

  /* Each element in sites is converted into a cleavemotif object.
   * Each cleavemotif will have one motif string, one array of monomer
   * code strings, one offset and one boolean cleave.
   */
  for (iter = 0; sites [iter] != NULL; iter++)
    {
      /* The cleavemotif of the previous round (if any) now belongs
       * to cls->clmGPA: make sure the failure path below never frees
       * it.
       */
      clm = NULL;

      cur = g_strdup (sites [iter]);

      /* The site length cannot be less than 2: the minimum site
       * cannot be less than something like "X/" or "/X".
       */
      if (strlen (cur) < 2)
	{
	  g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
		 _("%s@%d: this cleavage site is invalid: '%s'\n"),
		 __FILE__, __LINE__, cur);

	  goto failure;
	}

      /* We want one '/' exactly. 1 and no more no less: the first and
       * the last '/' must exist and be the same character.
       */
      first_slash = (gchar *) strchr (cur, '/');
      last_slash = (gchar *) strrchr (cur, '/');

      if (first_slash == NULL || first_slash != last_slash)
	{
	  g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
		 _("%s@%d: this cleavage site is invalid: '%s'\n"),
		 __FILE__, __LINE__, cur);

	  goto failure;
	}

      /* Now strip all the spaces from the site, be them internal
       * or in the lead/trail. cur is freed and reallocated in the
       * process.
       */
      cur = libpolyxmass_globals_unspacify_string (&cur);

      /* We may now check if the site is for a cleavage or not.
       * This means that if there is a '-', it should be the first
       * character, and then we understand that this is not for cleave.
       */
      dash = (gchar *) strchr (cur, '-');

      if (dash != NULL)
	{
	  /* strchr () returns the first '-', so it sits in the first
	   * position only if it points at the string start.
	   */
	  if (dash != cur)
	    {
	      g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
		     _("%s@%d: this cleavage site is invalid: '%s'\n"),
		     __FILE__, __LINE__, cur);

	      goto failure;
	    }

	  /* Remove the '-' by moving all the subsequent characters
	   * (terminating '\0' included) one position left: starting
	   * from "-Lys/Pro" we get "Lys/Pro".
	   */
	  memmove (cur, cur + 1, strlen (cur));
	}

      /* We now know if the site was for cleavage or not. So now we
       * have to allocate the cleavemotif object in order to start
       * filling its data proper.
       */
      clm = pxmchem_cleavemotif_new ();
      pxmchem_cleavemotif_set_cleave (clm, dash == NULL ? TRUE : FALSE);

      /* Now that a number of basic checks have been done, we can
       * actually parse the site into cleavemotif data.
       */
      result =
	pxmchem_cleave_parse_cleavage_site (cur, clm, delim_codes,
						codelen);

      if (result == -1)
	{
	  g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
		 _("%s@%d: failed to parse the cleavage site: '%s'\n"),
		 __FILE__, __LINE__, cur);

	  goto failure;
	}

      /* A site like " /" or "-/" passes the length check above but
       * holds no monomer code at all once spaces and '-' are removed:
       * such an empty motif cannot be searched for in a sequence.
       */
      if (result == 0)
	{
	  g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
		 _("%s@%d: this cleavage site is invalid: '%s'\n"),
		 __FILE__, __LINE__, cur);

	  goto failure;
	}

      /* result equals the number of monomer codes successfully parsed
       * in the site string. This value should thus be identical to
       * the number of monomer codes in the motifGPA array.
       */
      g_assert ((guint) result == clm->motifGPA->len);

      /* Each cleavemotif is new'ed with a -1 offset value, so that
       * checkings can be performed. Here we do such check.
       */
      g_assert (clm->offset >= 0);

      /* Now that the clm cleavemotif seems to be correct, we just
       * append it to the array of cleavemotif instances that belong
       * to the cleavespec 'cls' passed as param.
       */
      g_ptr_array_add (cls->clmGPA, clm);

      g_free (cur);
      cur = NULL;
    }

  /* Now that all the 'sites' has been dealt with, we should free it.
   */
  g_strfreev (sites);

  return cls->clmGPA->len;

 failure:
  /* Single error exit: release whatever belongs to the current round.
   */
  if (clm != NULL)
    pxmchem_cleavemotif_free (clm);

  g_free (cur);

  g_strfreev (sites);

  return -1;
}


gint
pxmchem_cleave_parse_cleavage_site (gchar *site,
				    PxmCleavemotif *clm,
				    gchar *delim_codes,
				    gint codelen)
{
  /* We get something like "Lys/Pro" or something like "KKGK/RRGK" and
   * we have to make three things:
   * 1. change the site "KKGK/RRGK" to a motif string (clm->motif).
   * 2. set the clm offset member to the number of monomer codes that
   *    are located left of '/' in the site string.
   * 3. fill clm->motifGPA with one allocated string per monomer code.
   *
   * 'delim_codes' is the list of valid monomer codes, in which each
   * code is enclosed between libpolyxmass_globals_delim characters.
   * 'codelen' is the maximum number of characters in a monomer code.
   *
   * This function will return the number of valid monomer codes that
   * could successfully be parsed out of the 'site'. For "KKGK/RRGK",
   * the function would return 8, for example. Upon error, -1 is
   * returned; the codes that were already appended to clm->motifGPA
   * are left there, and clm->motif is not modified.
   */
  gint idx = 0;
  gint count = 0;

  gchar err [2];
  gchar *code = NULL;
  gchar *delim = NULL;

  GString *gs = NULL;

  g_assert (site != NULL);
  g_assert (clm != NULL);
  g_assert (delim_codes != NULL);

  /* idx is a gint that walks along the site string: make sure it can
   * represent any position in it.
   */
  g_assert (strlen (site) < G_MAXINT);

  /* Allocate the slot in which we'll receive each code as it is gotten
   * from the site.
   */
  code = g_malloc0 (sizeof (gchar) * codelen + 1);

  /* Allocate the GString with which we'll construct the motif
   * derived from the site.
   */
  gs = g_string_new ("");

  while (1)
    {
      /* The function that we'll use must be able to calculate the
       * length of the code string made available to it, which is why
       * it requires that it be filled with 'X' characters.
       */
      memset (code, 'X', codelen);

      /* Reset the 2-bytes slot that is used to get back the offending
       * character from the parsing function below.
       */
      memset (err, '\x0', 2);

      /* The function call below returns -1 when it encounters a
	 non-alpha character. This may be the case if a totally insane
	 character is encountered, but it may also be the case if the
	 character is the '/' character that we need ! So we have to
	 test this by looking what is in the 'err' string. Depending
	 on that content, we decide if the returned -1 value is
	 serious or not.
      */
      if (-1 == pxmchem_monomer_extract_code_from_string (code, codelen,
							  site,
							  &idx,
							  err))
	{
	  if (err [0] == '/')
	    {
	      /* This is the cleavage mark: 'count' codes were read
		 left of it.
	      */
	      clm->offset = count;

	      idx++;

	      continue;
	    }

	  if (g_ascii_isspace ((gchar) err [0]))
	    {
	      /* A space character is just ignored.
	       */
	      idx++;

	      continue;
	    }

	  /* Any other character is a serious error.
	   */
	  goto failure;
	}

      /* There was no serious error. An empty 'code' means that the
	 end of the site string was reached: parsing is over.
      */
      if (code [0] == '\x0')
	break;

      /* 'code' contains something, check that the contents are a
	 valid monomer code by looking if the delimited code is found
	 in the 'delim_codes' parameter.
      */
      delim = g_strdup_printf ("%c%s%c",
			       libpolyxmass_globals_delim,
			       code,
			       libpolyxmass_globals_delim);

      if (NULL == strstr (delim_codes, delim))
	{
	  g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
		 _("%s@%d: monomer code is unknown: '%s'\n"),
		 __FILE__, __LINE__, code);

	  g_free (delim);

	  goto failure;
	}

      g_free (delim);

      /* Append the code to the elongating gs, which will be put
       * in the motif member of clm at the end of the site parsing.
       */
      gs = g_string_append (gs, code);

      /* Duplicate this VALID code so that we can put it into the
       * motifGPA of the cleavemotif.
       */
      g_ptr_array_add (clm->motifGPA, g_strdup (code));

      count++;

      /* BUT-ATTENTION - we may also have a value in err, since
       * imagine we had "M/" as a site to extract a monomer code
       * from. Then, we would effectively parse 'M' and then parse
       * '/', which would be put into err [0]. But while parsing
       * '/' the index in the
       * pxmchem_monomer_extract_code_from_string function has
       * also been incremented. Thus, we have to set the
       * clm->offset value to count here.
       */
      if (err [0] == '/')
	clm->offset = count;

      idx++;
    }
  /* end of while (1)
   */

  /* At this point, the parsing is finished without error, which means
   * that we can hand the character data of gs over to the motif
   * member of the cleavemotif instance clm (only the GString wrapper
   * is freed).
   */
  clm->motif = g_string_free (gs, FALSE);

  g_free (code);

  return count;

 failure:
  /* Release everything that was allocated locally.
   */
  g_string_free (gs, TRUE);

  g_free (code);

  return -1;
}


/* Searches the polymer sequence for each cleavemotif of 'cleavespec'
   and stores the index of the monomer after which the cleavage occurs
   into 'cleaveGA' (motif is for cleavage) or into 'nocleaveGA' (motif
   is not for cleavage).

   Returns the number of items put into both arrays. The function
   itself never fails, so -1 is not returned by the code below.
 */
gint
pxmchem_cleave_fill_cleave_nocleave_GA (PxmPolymer *polymer,
					PxmCleaveSpec *cleavespec,
					GArray *cleaveGA,
					GArray *nocleaveGA)
{
  gint count = 0;
  gint iter = 0;
  gint cleave_idx = 0;
  gint mnm_idx = 0;
  gint seq_len = 0;

  PxmCleavemotif *cleavemotif = NULL;


  g_assert (polymer != NULL);
  g_assert (polymer->monomerGPA != NULL);

  g_assert (cleavespec != NULL);
  g_assert (cleavespec->clmGPA != NULL);

  g_assert (cleaveGA != NULL);
  g_assert (nocleaveGA != NULL);

  seq_len = (gint) polymer->monomerGPA->len;

  /* We get a pointer to a cleavespec object that contains a gptrarray
     of cleavemotif instances into which we will iterate in order
     to search for motifs in the polymer sequence.
  */
  for (iter = 0; iter < (gint) cleavespec->clmGPA->len; iter++)
    {
      cleavemotif = g_ptr_array_index (cleavespec->clmGPA, iter);

      g_assert (cleavemotif->motifGPA != NULL);

      /* A motif without any monomer code cannot be located in the
	 sequence: skip it, otherwise the search below would never
	 make any progress.
      */
      if (cleavemotif->motifGPA->len == 0)
	continue;

      mnm_idx = -1;

      /* Search for successive occurrences of the motif, each time
	 starting one monomer past the previous match. The search
	 function asserts that the start index is inside the
	 sequence, so we must stop as soon as the next start index
	 would be past the last monomer.
      */
      while (mnm_idx + 1 < seq_len)
	{
	  mnm_idx =
	    pxmchem_cleave_find_next_cleavemotif (cleavemotif,
						  polymer->monomerGPA,
						  mnm_idx + 1);

	  if (mnm_idx == -1)
	    /* The motif represented by cleavemotif->motifGPA was not
	       found (any more) in the polymer sequence.
	    */
	    break;

	  /* Do not forget: The position at which the motif is found
	     in the polymer sequence is not necessarily the position
	     at which the cleavage will effectively occur. Indeed,
	     let's say that we found such motif in the polymer
	     sequence: "KKRKGP". This motif was extracted from a
	     cleavespec that had a pattern like this: "KKRK/GP". What
	     we see here is that the cleavage occurs after the fourth
	     monomer! And we must realize that the mnm_idx returned
	     above corresponds to the index of the first 'K' in
	     "KKRKGP" motif that was found in the polymer
	     sequence. Thus we have to take into account the offset
	     (+4, in our example, WHICH IS A POSITION and not an
	     index, which is why we need to remove 1 below) of the
	     cleavage:
	  */
	  cleave_idx = mnm_idx + cleavemotif->offset - 1;

	  /* Two special cases: a cleavage left of the first monomer
	     is meaningless, go on with the next occurrence. A
	     cleavage at or right of the last monomer is meaningless
	     also, and since all the following occurrences would be
	     located even farther right, we can stop here.
	  */
	  if (cleave_idx < 0)
	    continue;
	  if (cleave_idx >= seq_len - 1)
	    break;

	  /* Store the index in the array that corresponds to the
	     kind of the cleavemotif: for cleavage or not for
	     cleavage.
	  */
	  if (TRUE == cleavemotif->cleave)
	    cleaveGA = g_array_append_val (cleaveGA, cleave_idx);
	  else
	    nocleaveGA = g_array_append_val (nocleaveGA, cleave_idx);

	  count++;
	}
    }

  return count;
}


/* Removes from 'cleaveGA' every index that is also found in
   'nocleaveGA' (all the occurrences of such index are removed).
   'nocleaveGA' is not modified.

   Returns the number of items removed from cleaveGA. The function
   itself never fails, so -1 is not returned by the code below.
 */
gint
pxmchem_cleave_remove_nocleave_GA (GArray *cleaveGA, GArray *nocleaveGA)
{
  gint count = 0;

  guint iter_cleave = 0;
  guint iter_nocleave = 0;

  gint forbidden = 0;


  g_assert (cleaveGA != NULL);
  g_assert (nocleaveGA != NULL);

  for (iter_nocleave = 0; iter_nocleave < nocleaveGA->len; iter_nocleave++)
    {
      forbidden = g_array_index (nocleaveGA, gint, iter_nocleave);

      iter_cleave = 0;

      while (iter_cleave < cleaveGA->len)
	{
	  if (forbidden == g_array_index (cleaveGA, gint, iter_cleave))
	    {
	      /* Removing an item shifts the following ones one slot
		 down: the same slot must be checked again, so the
		 index is not incremented here.
	      */
	      g_array_remove_index (cleaveGA, iter_cleave);
	      count++;
	    }
	  else
	    iter_cleave++;
	}
    }

  return count;
}




  
/* Searches 'GPA' (an array of monomer instances representing a
   polymer sequence), from index 'idx' onwards, for the first stretch
   of monomers whose codes match, in order, all the monomer codes held
   in cleavemotif->motifGPA.

   This means that if
   cleavemotif->motifGPA [0] = "Lys"
   cleavemotif->motifGPA [1] = "Pro"

   we search for the first n >= idx such that GPA [n] has code "Lys"
   and GPA [n+1] has code "Pro".

   'idx' must be a valid index into a non-empty 'GPA'.

   Returns the index in 'GPA' of the first monomer of the matching
   stretch, or -1 if no match is found. -1 is also returned if the
   sequence or the motif is empty, because 0 would be mistaken by the
   caller for a match at the first monomer.
*/
gint
pxmchem_cleave_find_next_cleavemotif (PxmCleavemotif *cleavemotif,
				      GPtrArray *GPA,
				      gint idx)
{
  gint iter = 0;
  gint start = 0;

  gint seq_len = 0;
  gint motif_len = 0;

  gchar *code = NULL;

  PxmMonomer *monomer = NULL;


  g_assert (cleavemotif != NULL);
  g_assert (cleavemotif->motifGPA != NULL);

  g_assert (GPA != NULL);

  g_assert (idx >= 0);

  /* Nothing can be found in an empty sequence, and an empty motif
     cannot be located anywhere.
  */
  if (GPA->len <= 0 || cleavemotif->motifGPA->len <= 0)
    return -1;

  g_assert ((guint) idx < GPA->len);

  seq_len = (gint) GPA->len;
  motif_len = (gint) cleavemotif->motifGPA->len;

  /* Try each sequence position, starting at 'idx', as the potential
     first monomer of the motif. Positions that are too close to the
     sequence end to accomodate the whole motif cannot match, so the
     iteration stops before them.
  */
  for (start = idx; start + motif_len <= seq_len; start++)
    {
      /* Compare the motif codes one by one with the codes of the
	 monomers located at 'start' and following positions; stop
	 at the first mismatch.
      */
      for (iter = 0; iter < motif_len; iter++)
	{
	  code = g_ptr_array_index (cleavemotif->motifGPA, iter);
	  monomer = g_ptr_array_index (GPA, start + iter);

	  if (0 != strcmp (monomer->code, code))
	    break;
	}

      /* If the loop above ran through the whole motif, all the codes
	 matched: return the index of the first monomer of the
	 matching stretch.
      */
      if (iter == motif_len)
	return start;
    }

  /* At this point, we did not find any corresponding motif in the
     polymer sequence, so return -1 as the monomer index.
  */
  return -1;
}


/* Comparison callback used to sort an array of gint values in
   increasing order: 'a' and 'b' point to the two gint values to be
   compared. Returns -1 if *a < *b, +1 if *a > *b and 0 if both values
   are equal.
*/
gint
pxmchem_cleave_sort_increasing_cleave_GA (gconstpointer a,
					  gconstpointer b)
{
  const gint lhs = *(const gint *) a;
  const gint rhs = *(const gint *) b;

  /* Each comparison yields 0 or 1, so the difference is -1, 0 or +1.
   */
  return (lhs > rhs) - (lhs < rhs);
}


/* Returns the number of the total oligomers that were generated
   (in all the partial cleavages).
*/
gint
pxmchem_cleave_fill_oligomer_GPA (PxmPolymer *polymer,
				  GPtrArray *GPA,
				  GArray *cleaveidxGA,
				  PxmPolchemdef *polchemdef,
				  PxmCleaveOpt *cleaveopt,
				  PxmCalcOpt *calcopt,
				  PxmIonizerule *ionizerule)
{
  gint count = 0;
  
  gint length = 0;

  gint iter_cut = 0;
  gint part_cut = 0;

  PxmIonizerule *ionizerule_local = NULL;
  
  
  gboolean oligomer_is_polymer = FALSE;
  
  GPtrArray *partGPA = NULL;
  
  PxmOligomer *oligomer = NULL;
  


  g_assert (polymer != NULL);
  length = polymer->monomerGPA->len;

  g_assert (GPA != NULL);
  g_assert (cleaveidxGA != NULL);
  g_assert (cleaveopt != NULL);


  if (cleaveidxGA->len <= 0)
    return 0;
  
  if (length <= 0)
    return 0;
  
  /* Note that the function can be called with an ionizerule object. If
     this pointer is NULL, then we use the polchemdef->ionizerule.
  */
  if (ionizerule == NULL)
    ionizerule_local = polchemdef->ionizerule;
  else
    ionizerule_local = ionizerule;
  
  /* We generated as may arrays of oligomer as required in the number
     of partial cleavages asked in cleaveopt->partial. For each
     partial cleavage (starting from default partial 0) we do allocate
     a new oligomer GPtrArray and fill it with the relevant oligomers.
     Once a partial cleavage is done dealing with, we add its related
     GPtrArray of oligomers to the GPA.
  */
  for (partial = 0; partial <= cleaveopt->partial; partial++)
    {
      iter_cut = 0;
      
      /* le_idx is the index of the monomer at the left end of the
	 current oligomer made.
      */
      le_idx = 0;
      re_idx = 0;
      
      oligomer_is_polymer = FALSE;
      
      /* Each time we start a new partial cleavage round we have to
	 allocate a new GPtrArray of oligomers to accomodate the
	 generated oligomers into it. We add this new GPtrArray to the
	 GPA that is passed as parameter to this function.
      */
      partGPA = g_ptr_array_new ();
      g_ptr_array_add (GPA, partGPA);
      
      for (iter_cut = 0; iter_cut < cleaveidxGA->len; iter_cut++)
	{
	  part_cut = iter_cut + partial;
	  
	  /* Check that we are not trying to access an array index
	     greater than boundaries due to having a big partial
	     cleavage. If so exit the the loop and go for the right
	     end-terminal oligomer.
	  */
	  if (iter_cut < cleaveidxGA->len && part_cut >= cleaveidxGA->len)
	    {
	      /* The left end-terminal side for this oligomer is OK,
		 but the right end cannot be defined here, because we
		 cannot go into cleaveidxGA to get the position at
		 which cleavage should occur because, with the partial
		 cleavage at wich we are, we are outside of the border
		 of the array. Thus the right end monomer of the
		 currently made oligomer is the right end-terminal
		 monomer of the whole polymer.
	      */
	      
	      /* Note as a special sub-case, here, that if the partial
		 cleavage that is asked is so big that the very first
		 oligomer that we do goes immediately from the first
		 monomer to the last of the polymer sequence, all we
		 do is generate an oligomer that is identical to the
		 parent polymer. See below at end of this for loop how
		 the left end monomer index is calculated for current
		 oligomer.
	      */
	      if (iter_cut == 0)
		oligomer_is_polymer = TRUE;
	      break;
	    }
	  
	  re_idx = g_array_index (cleaveidxGA, gint, part_cut);
	  
	  oligomer = 
	    pxmchem_cleave_oligomer_new_with_options (polymer,
						      le_idx,
						      re_idx,
						      iter_cut,
						      cleaveopt,
						      calcopt,
						      ionizerule_local,
						      polchemdef);
	  
	  g_assert (oligomer != NULL);
	  
	  g_ptr_array_add (partGPA, oligomer);

	  count++;
	  
	  le_idx = g_array_index (cleaveidxGA, gint, iter_cut) + 1;
	}
      /* end of 
	 for (iter_cut = 0; iter_cut < cleaveidxGA->len; iter_cut++)
      */
      
      /* Attention, we should not forget the right end-terminal oligomer.
       */
      if (oligomer_is_polymer == TRUE)
	le_idx = 0;
      else
	le_idx = g_array_index (cleaveidxGA, gint, --iter_cut) + 1;
      
      /* The iter_cut variable is used below to construct the oligomer's
	 name string, which means that we have to increment it once
	 before calling the function, because we did not increment it
	 between the last-but-one and the last oligomers.
      */
      iter_cut++;
      
      /* Below, we remove 1 because the oligomer is described by using 
	 indexes and not positions.
      */
      re_idx = length - 1;

      oligomer = 
	pxmchem_cleave_oligomer_new_with_options (polymer,
						  le_idx,
						  re_idx,
						  iter_cut,
						  cleaveopt,
						  calcopt,
						  ionizerule_local,
						  polchemdef);
      
      g_assert (oligomer != NULL);
      
      g_ptr_array_add (partGPA, oligomer);
      
      count++;
    }
  /* end of 
     for (partial = 0; partial <= cleaveopt->partial; partial++)
  */

  /* At this point the GPA contains a number of pointers to GPtrArrays
     that in turn contain allocated oligomer instances.
  */
  return count;
}


/* Create one cleavage oligomer spanning monomer indexes
   [start_idx, end_idx] of 'polymer' and decorate it with the
   cleavage-specific data requested in 'cleaveopt'.

   polymer:    parent polymer; must be non-NULL and hold a monomerGPA.
   start_idx:  index of the oligomer's left end monomer in the polymer.
   end_idx:    index of the oligomer's right end monomer in the polymer.
   index:      0-based rank of the oligomer; it is printed 1-based in
               the oligomer's name.
   cleaveopt:  cleavage options; must be non-NULL. Its plm_chement
               member is forwarded to the oligomer constructor, and its
               put_sequence and mnm_chement members select which props
               get attached below.
   calcopt:    forwarded as is to the oligomer constructor. If calcopt
               is NULL, that means that no mass calculation is
               performed. If calcopt is NULL, polchemdef may be NULL
               also, since it is not required.
   ionizerule: must be non-NULL; forwarded to the oligomer constructor.
   polchemdef: forwarded to the oligomer constructor (see calcopt).

   The oligomer is named "p<partial>#<index + 1>", where <partial> is
   the current value of the file-scope 'partial' variable. Callers are
   thus expected to have set 'partial' before calling.

   Returns the newly created oligomer, or NULL (after logging a
   critical message) if the oligomer could not be created.
*/
PxmOligomer *
pxmchem_cleave_oligomer_new_with_options (PxmPolymer *polymer,
                                          gint start_idx,
                                          gint end_idx,
                                          gint index,
                                          PxmCleaveOpt *cleaveopt,
                                          PxmCalcOpt *calcopt,
                                          PxmIonizerule *ionizerule,
                                          PxmPolchemdef *polchemdef)
{
  gint iter = 0;

  PxmMonomer *monomer = NULL;
  PxmOligomer *oligomer = NULL;

  PxmProp *prop_new = NULL;
  PxmProp *prop_found = NULL;


  g_assert (polymer != NULL);
  /* The monomer array is indexed below when modifications are
     reported; fail early if it is missing.
  */
  g_assert (polymer->monomerGPA != NULL);

  g_assert (cleaveopt != NULL);
  g_assert (ionizerule != NULL);

  oligomer = pxmchem_oligomer_with_options_new (polymer,
                                                start_idx,
                                                end_idx,
                                                calcopt,
                                                ionizerule,
                                                cleaveopt->plm_chement,
                                                polchemdef);
  if (oligomer == NULL)
    {
      g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
             _("%s@%d: failed to create oligomer.\n"),
             __FILE__, __LINE__);

      return NULL;
    }

  /* Now that we have the oligomer, we can tailor its name: partial
     cleavage number, then 1-based rank of the oligomer.

     NOTE(review): the name member is overwritten without freeing any
     previous value; confirm that pxmchem_oligomer_with_options_new ()
     leaves it unset.
  */
  oligomer->name = g_strdup_printf ("p%d#%d", partial, index + 1);

  /* Now that we have dealt with all the cleavage-independent bits, we
     can deal with the cleavage-specific stuff, which is stored in the
     oligomer as props (each piece of data being a prop->data).
  */
  if (cleaveopt->put_sequence == TRUE)
    {
      /* Store the oligomer's monomer codes string as a "SEQUENCE"
         prop.
      */
      prop_new = libpolyxmass_prop_new ();

      libpolyxmass_prop_set_name (prop_new, "SEQUENCE");

      prop_new->data =
        (gchar *) pxmchem_polymer_make_codes_string (polymer,
                                                     oligomer->start_idx,
                                                     oligomer->end_idx);

      g_assert (prop_new->data != NULL);

      g_ptr_array_add (oligomer->propGPA, prop_new);
    }

  if (cleaveopt->mnm_chement != PXMCHEMENT_MNM_NONE)
    {
      /* For each monomer of the polymer sequence that belongs to the
         oligomer (both ends included), we have to check if the
         monomer is modified with a prop "MODIF". If so, we create an
         informative prop into the oligomer.
      */
      for (iter = oligomer->start_idx; iter < oligomer->end_idx + 1; iter++)
        {
          monomer = g_ptr_array_index (polymer->monomerGPA, iter);

          prop_found = libpolyxmass_prop_find_prop (monomer->propGPA,
                                                    NULL,
                                                    NULL,
                                                    "MODIF",
                                                    NULL,
                                                    PXM_CMP_NO_DEEP);

          if (NULL != prop_found)
            {
              /* The iterated monomer is modified, construct a string
                 representing the position in the oligomer (1-based)
                 and the index in the polymer (0-based) at which the
                 modif is found, followed by the modif prop's data.
              */
              prop_new = libpolyxmass_prop_new ();
              libpolyxmass_prop_set_name (prop_new, "POS/IDX/MODIF");
              prop_new->data =
                g_strdup_printf ("%d/%d/%s",
                                 /* pos in the oligomer */
                                 iter - oligomer->start_idx + 1,
                                 /* index in the sequence */
                                 iter,
                                 (gchar *) prop_found->data);

              g_ptr_array_add (oligomer->propGPA, prop_new);
            }
        }
      /* end of
         for (iter = oligomer->start_idx;
         iter < oligomer->end_idx + 1;
         iter++)
      */
    }
  /* end of
     if (cleaveopt->mnm_chement != PXMCHEMENT_MNM_NONE)
  */

  return oligomer;
}



      
