/* BEGIN software license
 *
 * MsXpertSuite - mass spectrometry software suite
 * -----------------------------------------------
 * Copyright(C) 2009,...,2018 Filippo Rusconi
 *
 * http://www.msxpertsuite.org
 *
 * This file is part of the MsXpertSuite project.
 *
 * The MsXpertSuite project is the successor of the massXpert project. This
 * project now includes various independent modules:
 *
 * - massXpert, model polymer chemistries and simulate mass spectrometric data;
 * - mineXpert, a powerful TIC chromatogram/mass spectrum viewer/miner;
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * END software license
 */

#include <QByteArrayView>

/////////////////////// Local includes
#include "Sequence.hpp"
#include "PolChemDef.hpp"


namespace MsXpS
{

namespace libXpertMass
{


/*!

\class MsXpS::libXpertMass::Sequence
\inmodule libXpertMass
\ingroup PolChemDefBuildingdBlocks
\inheaderfile Sequence.hpp

\brief The Sequence class provides abstractions to work with
a simple sequence of \l{Monomer}s.

A sequence of monomers can be represented in two ways:

\list
\li A string of monomer codes concatenated one to the other with no
delimitation(like "ATGC" or "AlaThrGlyCys");
\li A list of fully qualified \l{Monomer} instances allocated on the heap.
\endlist

\note The reference status of a sequence is in the form of a list of
allocated Monomer instances. The conversion to the string of monomer codes
is only a utility. When a sequence is created (with an argument that is a
string of monomer codes) the caller should ensure that the text sequence is
converted into a list of monomers prior to starting using its methods
extensively (see makeMonomerList()). Functions size() and
removeMonomerAt() only work on a sequence in the form of a list of
\l{Monomer} instances.

Methods are provided to convert from one sequence kind
(concatenated codes) to the other sequence kind(list of Monomer
instances).

Equally interesting is the ability of the methods in this class to
be able to:

- parse the monomer sequence and to extract monomer codes one
after the other;

- remove monomers from the sequence at specified indexes;

- add monomers to the sequence at specified indexes.

However, for this rather basic class to be able to perform
interesting tasks it has to be able to know where to find polymer
chemistry definition \l{PolChemDef} data. This is possible only when a
pointer to a polymer chemistry definition is passed to the used functions.
*/


/*!
   \variable MsXpS::libXpertMass::Sequence::m_monomerText

   \brief String holding the sequence of monomer codes.
 */

/*!
   \variable MsXpS::libXpertMass::Sequence::m_monomerList

   \brief List of allocated \l Monomer instances that should match the
sequence of codes string (m_monomerText).
 */


/*!
  \brief Constructs a Sequence from \a text.

  \a text is expected to be a string of concatenated monomer codes. It is
  stored as is: no quality check is performed and no Monomer instance is
  created by this constructor.
  */
Sequence::Sequence(const QString &text) : m_monomerText(text)
{
}

/*!
  \brief Constructs this Sequence as a deep copy of \a other.

  The monomer codes string is copied and each Monomer instance listed in
  \a other is duplicated on the heap, so that the two sequences never share a
  Monomer instance.
  */
Sequence::Sequence(const Sequence &other) : m_monomerText(other.m_monomerText)
{
  // other is const: iterating its list cannot modify it.
  for(const Monomer *source_monomer : other.m_monomerList)
    m_monomerList.append(new Monomer(*source_monomer));
}

/*!
  \brief Destructs this sequence.

  Every \l Monomer instance referenced by the member list is deleted, first
  to last, and the list is emptied.
*/
Sequence::~Sequence()
{
  for(int idx = 0; idx < m_monomerList.size(); ++idx)
    delete m_monomerList.at(idx);

  m_monomerList.clear();
}

/*!
\brief Assigns \a other to this Sequence.

The monomer codes string is copied. The Monomer instances previously owned by
this Sequence are deleted and replaced with newly allocated copies of the
Monomer instances of \a other (deep copy).

Self-assignment is a no-op.

Returns a reference to this Sequence.
*/
Sequence &
Sequence::operator=(const Sequence &other)
{
  if(&other == this)
    return *this;

  m_monomerText = other.m_monomerText;

  // Release the monomers owned so far. Without this step the copies made
  // below would be appended after the old monomers: the list would no longer
  // match the text and would not be a copy of other's list.
  while(!m_monomerList.isEmpty())
    delete m_monomerList.takeFirst();

  for(const Monomer *source_monomer : other.m_monomerList)
    m_monomerList.append(new Monomer(*source_monomer));

  return *this;
}

/*!
\brief Sets this Sequence's string of monomer codes to \a text.

Only the string is replaced; the list of Monomer instances is not touched by
this function.
*/
void
Sequence::setMonomerText(const QString &text)
{
  m_monomerText = text;
}

/*!
\brief Appends \a text, a string of monomer codes, to this Sequence's string
of monomer codes.

An empty \a text leaves the sequence unchanged. No verification is performed
on \a text and the list of Monomer instances is not updated.
*/
void
Sequence::appendMonomerText(const QString &text)
{
  if(!text.isEmpty())
    m_monomerText.append(text);
}


/*!
\brief Returns a pointer to this Sequence's string of monomer codes.

The pointer refers to the member string itself (m_monomerText), not to a
copy.
 */
const QString *
Sequence::monomerText()
{
  return &m_monomerText;
}

/*!
\brief Returns a const reference to this Sequence's list of \l Monomer
instances.
 */
const QList<const Monomer *> &
Sequence::monomerList() const
{
  return m_monomerList;
}


/*!
\brief Returns a pointer to this Sequence's list of \l Monomer instances.

The pointer refers to the member list itself (m_monomerList), so the caller
can modify the list through it.
 */
QList<const Monomer *> *
Sequence::monomerListPtr()
{
  return &m_monomerList;
}

/*!
\brief Returns the size of this Sequence, that is, the number of items in the
list of Monomer instances.

The string of monomer codes is not looked at: a Sequence that was only given
a text has a size of 0 until its Monomer list is filled.
  */
int
Sequence::size() const
{
  return m_monomerList.size();
}

/*!
\brief Tells if \a index can be used to access a Monomer instance in this
Sequence's list of \l{Monomer}s.

Returns true if \a index is in the [0 -- size() - 1] range, false otherwise.
*/
bool
Sequence::isInBound(int index)
{
  return index >= 0 && index < size();
}

/*!
\brief Removes all spaces, carriage returns and linefeeds from this Sequence's
monomer codes string.
*/
void
Sequence::unspacifyMonomerText()
{
  // Removal of all spaces, carriage returns and linefeeds:

  for(int iter = m_monomerText.length() - 1; iter >= 0; --iter)
    {
      QChar curChar = m_monomerText.at(iter);

      QChar::Category category = curChar.category();

      if(category == QChar::Separator_Space)
        m_monomerText.remove(iter, 1);

      else if(curChar == '\n')
        m_monomerText.remove(iter, 1);
      else if(curChar == '\r')
        m_monomerText.remove(iter, 1);
    }
}

/*!
\brief Rebuilds this Sequence's monomer codes string from its list of
\l{Monomer} instances.

m_monomerText is first cleared, then the code of each Monomer instance in
m_monomerList is appended to it, in list order.

Returns the count of monomer codes written to m_monomerText.

\sa makeMonomerList()
*/
int
Sequence::makeMonomerText()
{
  m_monomerText.clear();

  int written = 0;

  while(written < m_monomerList.size())
    {
      m_monomerText.append(m_monomerList.at(written)->code());
      ++written;
    }

  return written;
}

/*!
\brief Returns an allocated string with the monomer codes.

The returned string only contains the sequence of monomer codes for Monomer
instances between indices [\a start -- \a end] in this Sequence's list
of Monomer instances (m_monomerList).

The bounds are sanitized before use: \a start and \a end are swapped if
\a start is greater than \a end, a negative lower bound is set to 0 and an
upper bound that is negative or past the last Monomer is set to the index of
the last Monomer.

If \a with_modif is true, the modification(s) associated to \l{Monomer}s are
also output to the string: each modification name is written between angle
brackets right after the monomer code. The form of the string is, in this
case,

  \code
  Thr<Phosphorylation>
  \endcode

and, for a monomer bearing more than one modification,

  \code
  Thr<Phosphorylation><Acetylation>
  \endcode

The string is allocated on the heap (it is empty if this Sequence has no
Monomer instance) and the caller is responsible for deleting it.
*/
QString *
Sequence::monomerText(int start, int end, bool with_modif) const
{
  QString *p_text = new QString();

  if(size() == 0)
    return p_text;

  // Put the bounds in increasing order.
  int localStart = (start > end) ? end : start;
  int localEnd   = (start > end) ? start : end;

  if(localStart < 0)
    localStart = 0;

  if(localEnd < 0 || localEnd >= size())
    localEnd = size() - 1;

  for(int iter = localStart; iter <= localEnd; ++iter)
    {
      const Monomer *monomer = m_monomerList.at(iter);

      // The code is always output, whatever the modification status. It is
      // written exactly once per monomer, so that neither a stale text from
      // a previous iteration nor a repeated code can end up in the result.
      p_text->append(monomer->code());

      if(!with_modif || !monomer->isModified())
        continue;

      // Output all the modifications, not only the last one.
      for(int jter = 0; jter < monomer->modifList()->size(); ++jter)
        p_text->append(
          QString("<%1>").arg(monomer->modifList()->at(jter)->name()));
    }

  return p_text;
}


/*!
\brief Returns an allocated string with the monomer codes.

The returned string is the concatenation of the monomer codes of the Monomer
instances contained in each region (\l Coordinates) listed in
\a coordinate_list, in list order. A linefeed is appended after the last
region.

If \a with_modif is true, the modification(s) associated to \l{Monomer}s are
also output to the string. The form of the string is, in this case,

  \code
  Thr<Phosphorylation>
  \endcode

If \a delimited_regions is true, the Monomer codes belonging to each region
are written on their own line, prefixed with "Region" and the positions of
the corresponding Coordinates.

The string is allocated on the heap and the caller is responsible for
deleting it.

\sa Coordinates::positionsAsText()
*/
QString *
Sequence::monomerText(const CoordinateList &coordinate_list,
                      bool with_modif,
                      bool delimited_regions) const
{
  QString *result = new QString();

  for(int region = 0; region < coordinate_list.size(); ++region)
    {
      Coordinates *coords = coordinate_list.at(region);

      // Text of the current region only; it is allocated by the callee and
      // must be freed here.
      QString *region_text =
        monomerText(coords->start(), coords->end(), with_modif);

      if(!delimited_regions)
        result->append(*region_text);
      else
        result->append(QString("Region %1: %2\n")
                         .arg(coords->positionsAsText())
                         .arg(*region_text));

      delete region_text;
    }

  result->append(QString("\n"));

  return result;
}


/*!
\brief Allocates the Monomer instances that describe this Sequence's string
of monomer codes.

The member string of monomer codes (m_monomerText) is first stripped of its
spacing characters, then parsed code after code. For each code, a \l Monomer
instance is allocated, initialized by looking the code up in the reference
Monomer list of \a pol_chem_def_csp, and appended to the member list of
Monomer instances (m_monomerList).

If \a reset is true, the Monomer instances currently in the list are deleted
before the work is done. Otherwise the new instances are appended after the
existing ones.

If \a errorList is non-nullptr, it must be empty. Parsing then goes on after a
failure and the value of the parsing index at each failure is appended to
\a errorList. If \a errorList is nullptr, parsing stops at the first failure.

Because the m_monomerText member string of Monomer codes does not document any
monomer modification, no modifications are handled in this function.

Returns the size of the Monomer instances list, or -1 if \a pol_chem_def_csp
is null or if any error occurred.
  */
int
Sequence::makeMonomerList(PolChemDefCstSPtr pol_chem_def_csp,
                          bool reset,
                          QList<int> *errorList)
{
  if(!pol_chem_def_csp)
    {
      qDebug() << "The PolChemDef pointer is nullptr!";
      return -1;
    }

  // If error indices are to be stored, the list MUST be empty.
  if(errorList)
    Q_ASSERT(errorList->size() == 0);

  if(reset)
    {
      while(!m_monomerList.isEmpty())
        delete m_monomerList.takeFirst();
    }

  unspacifyMonomerText();

  int index = 0;
  QString err;
  QString code;

  int ret = nextCode(&code, &index, &err, pol_chem_def_csp->codeLength());

  const QList<Monomer *> &refList = pol_chem_def_csp->monomerList();

  for(;;)
    {
      // Set when the current code failed but parsing must go on because the
      // caller asked for the error positions.
      bool record_error = false;

      if(ret < 0)
        {
          // The text at index is not a proper code.
          if(!errorList)
            break;

          record_error = true;
        }
      else if(ret == 0)
        {
          // Nothing left to parse.
          break;
        }
      else
        {
          Monomer *monomer = new Monomer(pol_chem_def_csp, "NOT_SET");

          if(Monomer::isCodeInList(code, refList, monomer) == -1)
            {
              qDebug() << "Monomer:" << monomer->name()
                       << "was not found in the monomer reference list.";

              delete monomer;

              if(!errorList)
                return -1;

              record_error = true;
            }
          else
            {
              m_monomerList.append(monomer);
            }
        }

      if(record_error)
        errorList->append(index);

      // nextCode() leaves it to the caller to move index forward before the
      // next parsing round.
      ++index;

      ret = nextCode(&code, &index, &err, pol_chem_def_csp->codeLength());
    }

  if(errorList && errorList->size())
    return -1;

  if(ret == -1)
    return -1;

  return m_monomerList.size();
}


/*!
\brief Seeks the next code occurring in this Sequence's string of Monomer
codes.

This function starts looking in this Sequence's string of Monomer codes
(m_monomerText) at the position pointed to by \a index. A code is made of one
first character that is a letter but not a lowercase one, followed by
lowercase letters. Parsing of the code stops when \a code_length characters
have been read, when a second non-lowercase letter is met (start of the next
code) or when the end of the string is reached.

\a code and \a err are cleared on entry. None of \a code, \a index and \a err
may be nullptr.

On success, the code is stored in \a code and the value pointed to by
\a index is increased by the number of positions walked. When parsing stopped
because of \a code_length or because the next code started, \a index then
designates the last character of the returned code. The caller must increment
it by one before calling this function again.

On failure, the faulty character is stored in \a err and \a code is left
empty:

\list
\li if a character that is not a letter is met, \a index is moved to the
position of that character;
\li if the first character is a lowercase letter, \a index is left unchanged.
\endlist

Returns the count of characters that make \a code (0 if there was nothing
left to parse), or -1 on failure.
  */
int
Sequence::nextCode(QString *code, int *index, QString *err, int code_length)
{
  QString newCode;
  int iter = 0;

  // We get a sequence of monomer codes(like "LysArgGlu" for example)
  // and we have to return the next code starting from *index. Note
  // that the sequence must not contain invalid characters. The
  // invalid characters might be placed in err for further scrutiny by
  // the caller.

  // Returns the count of actually parsed characters in the string
  // newCode(copied to 'code' param). If an error occurs -1 is
  // returned and the faulty character is copied in 'err'. 'index' is
  // updated with the index of the last valid character parsed for
  // current code.

  Q_ASSERT(code);
  Q_ASSERT(index);
  Q_ASSERT(err);

  code->clear();
  err->clear();

  int codes_string_length = m_monomerText.length();

  // iter is the offset, relative to *index, of the character being examined.
  while(1)
    {
      if(iter >= code_length)
        {
          // Because we have progressed farther than authorized by
          // the number of characters allowed in the monomer codes
          // of this polymer chemistry definition, we decrement iter
          // and break the loop... Later in this function, we'll set
          // the proper index in the sequence where next parsing run
          // should occurs (the calling function will increment
          // *index by one).

          --iter;
          break;
        }

      // End of the string: stop here. Note that iter is not decremented in
      // this case.
      if(iter + *index >= codes_string_length)
        break;

      QChar curChar = m_monomerText.at(iter + *index);

      if(!curChar.isLetter())
        {
          // 	    qDebug() << __FILE__ << __LINE__
          // 		     << "The character is not a letter:"
          // 		     << curChar;

          *err = curChar;

          // The non-Letter character might be '/', which would be
          // perfectly fine, as we use it to symbolize the actual
          // cleavage site. Which means that we will continue
          // parsing the rest of the string : we have to give the
          // current position back to the caller in the *index
          // variable for the next call to this function to start at
          // next character (not falling back to '/', which would
          // make us enter in an infinite loop).

          *index = *index + iter;

          return -1;
        }

      // Only the lowercase letter category counts as "lower": any other
      // letter is handled as the potential start of a code.
      bool isLower = (curChar.category() == QChar::Letter_Lowercase);

      if(iter == 0)
        {
          if(isLower)
            {
              // qDebug() << __FILE__ << __LINE__
              // << "First character of monomer code might not be"
              // << "lower case; sequence is"
              // << m_monomerText;

              // A code cannot start with a lowercase letter. *index is left
              // untouched in this case.
              *err = curChar;

              return -1;
            }
          else
            {
              // Good, first char is not lowercase.
              newCode += curChar;
            }
        }
      else //(iter != 0)
        {
          // We are not in our first iteration. So either the current
          // character is lowercase and we are just continuing to
          // iterate into a multi-char monomer code, or the current
          // character is uppercase, in which case we are starting to
          // iterate in a new monomer code.

          if(isLower)
            newCode += curChar;
          else
            {
              // Decrement iter, because this round was for nothing:
              // we had "invaded" the next monomer code in sequence,
              // which we must not do.

              --iter;
              break;
            }
        }

      ++iter;
    }

  // We finished parsing at most codeLength characters out of
  // 'm_monomerText', so we have a valid code in the 'code' variable. We
  // can also compute a new index position in the sequence and return
  // the number of characters that we effectively parsed. Note that
  // the caller will be responsible for incrementing the 'index' value
  // by one character unit so as not to reparse the last characters of
  // the sent 'code' object.

  *index = *index + iter;
  *code  = newCode;
  err->clear();

  return code->length();
}

/*!
\brief Searches for a Sequence textual \a motif in this Sequence's
list of Monomer instances starting at \a index.

\a motif, a text string, is first converted to a list of Monomer instances
(using the reference list of monomers in \a pol_chem_def_csp). Then, this
Sequence's monomer instances list is searched, from the position pointed to
by \a index onwards, for a monomer stretch matching that created for
\a motif.

Monomers are compared by their code only: the modifications they may bear
are not taken into account.

As soon as a matching stretch is found, the index of its first Monomer in
this Sequence's list of Monomer instances is set to \a index. If no match is
found, \a index is left unchanged.

None of \a motif, \a pol_chem_def_csp and \a index may be null.

Returns 1 if \a motif was found in this Sequence, 0 if it was not found (or
if \a motif is empty) and -1 if \a index is out of bounds or if \a motif
could not be converted to a list of Monomer instances.
*/

int
Sequence::findForwardMotif(Sequence *motif,
                           PolChemDefCstSPtr pol_chem_def_csp,
                           int *index)
{
  Q_ASSERT(motif);
  Q_ASSERT(pol_chem_def_csp);
  Q_ASSERT(index);

  if(*index < 0 || *index >= size())
    return -1;

  // The motif must be converted to Monomer instances *before* its size is
  // looked at: size() only counts Monomer instances, so a motif that was
  // created from text reports a size of 0 until this conversion is done.
  if(motif->makeMonomerList(pol_chem_def_csp) == -1)
    return -1;

  const int motifSize = motif->size();

  // If motif's length is 0, then nothing to search for, return
  // unmodified 'index'.
  if(!motifSize)
    return 0;

  // Simple optimization: if the motif cannot fit between index and the end
  // of the sequence, return right away. A motif that ends exactly on the
  // last monomer of the sequence does fit, hence the strict comparison.
  if(*index + motifSize > size())
    return 0;

  // Last position in this sequence at which a full motif can still start.
  // Because the search goes forward, no position past this one has to be
  // tested, and no access below can go past the end of either list.
  const int lastStart = size() - motifSize;

  for(int iter = *index; iter <= lastStart; ++iter)
    {
      bool matched = true;

      for(int kter = 0; kter < motifSize; ++kter)
        {
          // We do not compare with operator == because that comparison
          // would involve the comparison of modifications inside the
          // monomers, which would not work here.
          if(at(iter + kter)->code() != motif->at(kter)->code())
            {
              matched = false;
              break;
            }
        }

      if(matched)
        {
          *index = iter;
          return 1;
        }
    }

  // No match could be achieved, we have to let the caller function
  // know this in a durable manner : returning 0.

  return 0;
}

/*!
\brief Returns the Monomer instance at index \a index in this Sequence's
monomer instance list.

\a index must be in the [0 -- size() - 1] range. Any other value is a fatal
error (qFatal() is called).
  */
const Monomer *
Sequence::at(int index) const
{
  qDebug() << "In call at() with value:" << index;

  if(index < 0)
    qFatal("%s@%d -- Index cannot be less than 0.", __FILE__, __LINE__);

  // The last valid index is size() - 1: an index equal to the size of the
  // list is already out of range and must not reach QList::at().
  if(index >= m_monomerList.size())
    qFatal("%s@%d -- Index cannot be greater than polymer size.",
           __FILE__,
           __LINE__);

  return m_monomerList.at(index);
}

/*!
\brief Returns the index of \a monomer in this Sequence's list of Monomer
instances.

The search compares pointers, not Monomer contents: the returned index is
that of the very \e same monomer object. If the pointer occurs more than once
in the list, the index of its first occurrence is returned.

Returns -1 if \a monomer is not found.
  */
int
Sequence::monomerIndex(const Monomer *monomer)
{
  // The list stores pointers, so indexOf() compares addresses.
  return m_monomerList.indexOf(monomer);
}

/*!
\brief Inserts \a monomer at index \a index in this Sequence's list of
Monomer instances.

\a monomer cannot be nullptr and \a index must be in the [0 -- size()] range
(an index equal to size() appends the monomer). Both conditions are only
checked with assertions.

The pointer itself is stored, no copy of the Monomer is made. Since the
Monomer instances in the list are deleted by removeMonomerAt() and by the
destructor, the inserted monomer has to be allocated on the heap and must not
be deleted by the caller. The string of monomer codes is not updated.

Returns true.
*/
bool
Sequence::insertMonomerAt(const Monomer *monomer, int index)
{
  Q_ASSERT(monomer);
  Q_ASSERT(index > -1 && index <= size());

  m_monomerList.insert(index, monomer);

  return true;
}


/*!
\brief Hook called by removeMonomerAt() with the \a monomer that is about to
be removed from this Sequence.

This implementation does nothing with \a monomer and always returns true,
which lets the removal proceed. A false return value makes removeMonomerAt()
give up the removal.

NOTE(review): presumably meant to be overridden in derived classes; confirm
against the declaration in Sequence.hpp.
*/
bool
Sequence::prepareMonomerRemoval([[maybe_unused]] const Monomer *monomer)
{
  return true;
}

/*!
\brief Removes the monomer instance at index \a index from this Sequence's list
of Monomer instances.

Returns true.
*/
bool
Sequence::removeMonomerAt(int index)
{
  Q_ASSERT(index > -1);
  Q_ASSERT(index < size());

  const Monomer *monomer = at(index);

  if(!prepareMonomerRemoval(monomer))
    return false;

  m_monomerList.removeAt(index);

  delete monomer;

  return true;
}

/*!
\brief Validates this Sequence using \a pol_chem_def_csp as the reference
polymer chemistry definition.

The validation consists in converting the textual representation of the
sequence (m_monomerText) into Monomer instances by calling makeMonomerList(),
which actually fills in m_monomerList.

Returns true if that conversion succeeded, false if an error occurred.

\sa makeMonomerList()
*/
bool
Sequence::validate(PolChemDefCstSPtr pol_chem_def_csp)
{
  Q_ASSERT(pol_chem_def_csp);

  // makeMonomerList() reports failure with -1.
  return makeMonomerList(pol_chem_def_csp) > -1;
}

/*!
\brief Returns a checksum calculated on this Sequence's portion contained in
[\a index_start -- \a index_end].

The text matching the [\a index_start -- \a index_end] range is built from
the list of Monomer instances by calling monomerText(), with (\a with_modifs
is true) or without (\a with_modifs is false) the monomer modifications. The
checksum is computed with qChecksum() on the UTF-8 encoding of that text.

Returns the checksum, or 0 if this Sequence has no Monomer instance.
*/
quint16
Sequence::checksum(int index_start, int index_end, bool with_modifs) const
{
  if(!size())
    return 0;

  QString *text = monomerText(index_start, index_end, with_modifs);

  const QByteArray bytes = text->toUtf8();

  // monomerText() hands over ownership of the string it allocates: free it
  // now that its contents have been copied to the byte array.
  delete text;

  quint16 checksum = qChecksum(QByteArrayView(bytes));

  //     qDebug() << __FILE__ << __LINE__
  // 	     << "checksum:" << checksum;

  return checksum;
}


} // namespace libXpertMass

} // namespace MsXpS
