/*
  clean2003.c - 20070523 - additional clean up of Word HTML markup

  (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  After using CleanWord2000 there are still some rendering
  problems regarding paragraphs. This module attempts to
  fix that such that the result is rendered closer to the
  original.

  With the bare and word-2000 options, in order to make the browser
  rendering consistent with the original rendering, attempt to strip
  'blank' paragraphs. These can have &nbsp; in them, like
  <p>text 1</p>
  <p>&nbsp;</p> - this is removed
  <p>text 2</p>
 
  However if paragraphs are consecutive
  <p>text 1</p>
  <p>text 2</p>
  <p>text 3</p>
  then change to single paragraph, using <br> to separate lines
  <p>text 1<br>
  text 2<br>
  text 3</p>

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "tidy-int.h"
#include "clean.h"
#include "lexer.h"
#include "parser.h"
#include "attrs.h"
#include "message.h"
#include "tmbstr.h"
#include "utf8.h"

/* Debug builds (NDEBUG not defined) include sprtf.h - presumably the
   declaration of sprtf - and declare the external node dumper used below.
   When NDEBUG is defined, sprtf is simply mapped onto printf. */
#ifndef  NDEBUG
#include "sprtf.h"
extern void tidydbg_ShowNode( TidyDocImpl* doc, Node* node, tmbstr src );
#else
#define  sprtf printf
#endif   /* ifndef NDEBUG */

/* Record used to 'type' paragraphs: one int per <p> element, in tree
   order, where 0 marks a blank paragraph and 1 a paragraph with content. */
typedef struct {
   int   intoffset;      /* index of the slot to be written / read next */
   int   setofints[1];   /* first slot; the user allocates room for more */
} PARASTR;

/* pointer form, as passed between the functions of this module */
typedef PARASTR * PPARASTR;

/************************************************************************
* Clean2003_CountParas( TidyDocImpl* doc, Node * node )
*
* Visits 'node', each of its following siblings, and (recursively) all
* of their descendants, and returns how many of the visited nodes are
* <p> elements.
* In debug builds every visited node is also handed to tidydbg_ShowNode.
************************************************************************/

static long  Clean2003_CountParas( TidyDocImpl* doc, Node * node )
{
   long total = 0;
   Node * cur;

   for ( cur = node; cur != NULL; cur = cur->next )
   {
#ifndef  NDEBUG
      tidydbg_ShowNode( doc, cur, 0 );
#endif   /* ifndef NDEBUG */
      if ( nodeIsP(cur) )
         ++total;

      /* add the paragraphs found below this node, if it has children */
      if ( cur->content != NULL )
         total += Clean2003_CountParas( doc, cur->content );
   }
   return total;
}

/*******************************************************
* Clean2003_textFromPNode
* (defined below, after its helper Clean2003_Get_Text)
*
* Returns the characters contained within a node,
* exploring 'content' until a text node is found.
* Returns NULL if no text was found, else the buffered
* characters, at most TIDY_MAX_TEXT - 1 of them.
* If the node has content but no text was reached, the
* buffer returned holds the string "NOT text content".
*******************************************************/
#define  TIDY_MAX_TEXT  256

/*******************************************************
* Clean2003_Get_Text
*
* Appends the characters of one node - the lexer buffer
* bytes from node->start up to, not including, node->end -
* to 'txt', starting at index 'x'.
*
*   doc  - document whose lexer buffer holds the text
*   node - node whose start/end offsets delimit the text
*   txt  - destination, at least TIDY_MAX_TEXT chars long
*   x    - number of chars already stored in 'txt'
*   all  - if set every character is copied, otherwise
*          white space and no-break space (160) are skipped
*
* Returns the new number of chars in 'txt'. No terminating
* NUL is written here, but storing stops at
* TIDY_MAX_TEXT - 1 so the caller has room to add one.
*
* Each stored element is a single char. A decoded UTF-8
* character above 0xFF cannot be represented that way and
* is stored as '?': narrowing it to a char could otherwise
* produce a NUL, a white space character or 160, and the
* text would then wrongly look blank to
* Clean2003_IsWhitespace.
*******************************************************/
static uint Clean2003_Get_Text( TidyDocImpl * doc, Node * node, tmbstr txt, uint x, Bool all )
{
   uint i;
   for (i = node->start; i < node->end; i++ )
   {
      uint c = (byte)doc->lexer->lexbuf[i];
      /* look for UTF-8 multibyte character; the value added to i is
         presumably the number of trailing bytes of the sequence, with the
         loop increment stepping over the lead byte - TODO confirm */
      if ( c > 0x7F )
         i += TY_(GetUTF8)( doc->lexer->lexbuf + i, &c );
      /* Check buffer overflow */
      if ( x >= TIDY_MAX_TEXT-1 )
         break;
      if( all || ( ( c != 160 ) && !TY_(IsWhite)( c ) ) )
      {
         /* keep values that fit in one char, use a visible
            placeholder for everything else */
         txt[x++] = ( c > 0xFF ) ? '?' : (tmbchar)c;
      }
   }
   return x;
}

/* Gathers the leading text of 'innode' into a function-local static
   buffer and returns that buffer, so the result is overwritten by the
   next call.
   The first-child chain is followed until a text node or a childless node
   is reached. If a text node is reached, it and the text nodes directly
   following it are copied with Clean2003_Get_Text ('all' is passed
   through); NULL is returned when nothing was copied, even if non-text
   siblings follow. If no text node is reached, "NOT text content" is
   returned when 'innode' has content, otherwise NULL. */
static tmbstr Clean2003_textFromPNode( TidyDocImpl* doc, Node* innode, Bool all )
{
    static char text_buf[TIDY_MAX_TEXT];
    tmbstr out = text_buf;
    Node * cur = innode;
    uint used = 0;

    /* walk down through first children */
    while ( !TY_(nodeIsText)(cur) && cur->content )
        cur = cur->content;

    if ( !TY_(nodeIsText)(cur) )
    {
        /* no text reached: distinguish 'empty' from 'other content' */
        if ( innode->content == NULL )
            return NULL;
        strcpy( out, "NOT text content" );
        return out;
    }

    /* copy this text node and the successive NEXT text nodes */
    for ( ; cur && TY_(nodeIsText)(cur); cur = cur->next )
        used = Clean2003_Get_Text( doc, cur, out, used, all );

    if ( used == 0 )
        return NULL;

    out[used] = '\0';
    return out;
}

/************************************************************************
* Bool Clean2003_IsWhitespace( ctmbstr pString )
*
* Returns 'yes' when every character of the string is white space,
* 'no' as soon as one is not. A NULL or empty string gives 'yes'.
* The six character entity "&nbsp;" and the byte value 160 are both
* accepted as white space.
************************************************************************/

static Bool Clean2003_IsWhitespace( ctmbstr pString )
{
    const byte * p = (const byte *)pString;

    if ( p == NULL )
        return yes;

    while ( *p != 0 )
    {
        if ( *p == '&' &&
             TY_(tmbstrlen)( (ctmbstr)p ) >= 6 &&
             TY_(tmbstrncmp)( (ctmbstr)p, "&nbsp;", 6 ) == 0 )
        {
            p += 6;     /* step over the whole entity */
            continue;
        }

        /* 160 passes as WHITE, anything else is checked by IsWhite */
        if ( *p != 160 && !TY_(IsWhite)( *p ) )
            return no;

        ++p;
    }
    return yes;
}

/************************************************************************
* void  Clean2003_SetParas( TidyDocImpl* doc, Node * node, PPARASTR pps )
*
* Visits 'node', its following siblings and all their descendants.
* For each <p> element met, the slot at pps->intoffset is set to
* 0 when the paragraph has no text or only white space, else to 1,
* and pps->intoffset is advanced by one.
* No bounds check is made: 'pps' must have a slot per paragraph.
************************************************************************/

static void  Clean2003_SetParas( TidyDocImpl* doc, Node * node, PPARASTR pps )
{
   Node * cur;

   for ( cur = node; cur != NULL; cur = cur->next )
   {
#ifndef  NDEBUG
      tidydbg_ShowNode( doc, cur, 0 );
#endif   /* ifndef NDEBUG */
      if ( nodeIsP(cur) )
      {
         tmbstr text = Clean2003_textFromPNode( doc, cur, no );
         int    mark = 1;             /* para with text (or img, ...) */

         if ( text == NULL || Clean2003_IsWhitespace(text) )
            mark = 0;                 /* BLANK */

         pps->setofints[pps->intoffset] = mark;
         pps->intoffset++;
      }

      if ( cur->content != NULL )
         Clean2003_SetParas( doc, cur->content, pps );
   }
}

/************************************************************************
* Clean2003_DiscardContainer
*
*  Extricate "element", replace it by its content and delete it,
*  returning its 'contents', if any, in the Node **
*
*  doc     - document, passed on to the node free / discard routines
*  element - the container to remove; when it has content its parent
*            is dereferenced without a check, so it must have one
*  pnode   - receives the first child of "element" when it has content;
*            otherwise receives whatever TY_(DiscardElement) returns
*            (presumably the node that followed "element")
************************************************************************/

static void Clean2003_DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
{
    if (element->content)
    {
        Node *node, *parent = element->parent;

        /* splice the tail of the child list onto what followed element */
        element->last->next = element->next;

        if (element->next)
        {
            element->next->prev = element->last;
        }
        else
            parent->last = element->last;   /* element was the last child */

        /* splice the head of the child list onto what preceded element */
        if (element->prev)
        {
            element->content->prev = element->prev;
            element->prev->next = element->content;
        }
        else
            parent->content = element->content;   /* element was the first child */

        /* re-parent the children; because the child list is already linked
           to element's former next sibling, this walk continues through
           those siblings as well, assigning them the parent they had */
        for (node = element->content; node; node = node->next)
            node->parent = parent;

        *pnode = element->content;

        /* detach the links before freeing, so that freeing element cannot
           reach the nodes that were just moved */
        element->next = element->content = NULL;
        TY_(FreeNode)(doc, element);
    }
    else
    {
        /* nothing to keep: just remove the empty element */
        *pnode = TY_(DiscardElement)(doc, element);
    }
}

/************************************************************************
*  CleanWord2003
*
*  The main purpose of this module.
*  Walks 'node', its following siblings and their descendants. For each
*  paragraph node the next slot of 'pps' is consumed, and the paragraph -
*  (a) is DELETED if its slot marks it as 'all spacey', else
*  (b) if the paragraph counted just before it was NOT 'spacey', and the
*  node directly before it is a paragraph too, has its container
*  discarded: a <br> is appended to that previous paragraph, followed
*  by ALL of the discarded paragraph's children, or
*  (c) is left alone.
*
*  pps->intoffset must be 0 on entry, and pps->setofints must have been
*  filled by Clean2003_SetParas for the same tree.
*  Children moved in (b) are not visited afterwards, so this relies on
*  <p> elements not being nested inside one another.
************************************************************************/

static void _s_CleanWord2003( TidyDocImpl* doc, Node *node, PPARASTR pps )
{
   while ( node )
   {
      if ( nodeIsP(node) )
      {
         int   last = pps->intoffset; /* keep this offset, and */
         pps->intoffset++;            /* bump to next ... */
         if ( pps->setofints[last] == 0 )
         {
            /* blank paragraph: drop it and go on with what followed it */
            node = TY_(DiscardElement)( doc, node );
            continue;
         }
         else if ( last && pps->setofints[last-1] && /* if NO spacey para DISCARDED */
            node->content &&                         /* there is something to move */
            node->prev && nodeIsP(node->prev) )
         {
            /* ok, previous was NOT a BLANK, thus is continuous paragraphs -
               these should use <br>, not <p>! */
            Node * target    = node->prev;  /* the <p> to append to */
            Node * lastChild = node->last;  /* end of the run to be moved */
            Node * child     = NULL;
            Node * following;
            Bool   done;

            /* separate the two former paragraphs with a <br> */
            TY_(InsertNodeAtEnd)( target, TY_(InferredTag)(doc, TidyTag_BR) );

            /* discard the <p> container; its children become siblings of
               'target', the first of them is returned in 'child' */
            Clean2003_DiscardContainer( doc, node, &child );

            /* move every former child, not only the first, so that no
               content is left behind outside of a paragraph */
            do
            {
               following = child->next;   /* read before the links are reset */
               done = ( child == lastChild ) ? yes : no;
               TY_(RemoveNode)( child );
               TY_(InsertNodeAtEnd)( target, child );
               child = following;
            } while ( !done && child );

            node = target->next; /* load next, and */
            continue;   /* loop */
         }
      }

      if ( node->content )
         _s_CleanWord2003( doc, node->content, pps );

      node = node->next;
   }
}

/* Debug dump of 'node', its following siblings and their descendants.
   In debug builds each node is preceded by 'level' spaces and shown with
   tidydbg_ShowNode. In all builds " C " is written before descending into
   children, " N " before moving to a sibling, and " E ", level - 1 spaces
   (when level is not 0) and "X\n" after the last sibling. */
void  _test_show( TidyDocImpl * doc, Node * node, uint level )
{
   uint pad;

   for ( ; node != NULL; node = node->next )
   {
#ifndef  NDEBUG
      for ( pad = 0; pad < level; pad++ )
         sprtf(" ");
      tidydbg_ShowNode( doc, node, 0 );
#endif   /* ifndef NDEBUG */

      if ( node->content != NULL )
      {
         sprtf( " C " );
         _test_show( doc, node->content, level + 1 );
      }

      if ( node->next != NULL )
      {
         sprtf( " N " );
         continue;
      }

      /* end of this sibling list */
      sprtf( " E " );
      for ( pad = 1; pad < level; pad++ )
         sprtf(" ");
      sprtf( "X\n" );
   }
}

/************************************************************************
*  PUBLIC: TY_(CleanWord2003)( TidyDocImpl* doc, Node *node)
*
*  1. Count the paragraphs in the parse tree.
*  2. Allocate a structure to carry information.
*  3. Process the paragraphs, setting their 'type' into the structure.
*  4. Finally, process the paragraphs according to the previous type
*  5. Toss the allocated memory.
*
*  The AIM of this clean up is to take a content like -
*  <p>Line 1</p>
*  <p>&nbsp;</p>
*  <p>Line 3</p>
*  <p>Line 4</p>
*  and convert it to -
*  <p>Line 1</p>
*  <p>Line 3<br>
*  Line 4</p>
*
************************************************************************/
/* Debug helper: in debug builds writes a banner and dumps the tree at
   'node' with _test_show; does nothing when NDEBUG is defined. */
void TY_(CleanWord2003_TEST)( TidyDocImpl* doc, Node *node)
{
#ifndef  NDEBUG
   sprtf( "\nCleanWord2003: Calling _test_show ...\n" );
   _test_show( doc, node, 0 );
#endif   /* ifndef NDEBUG */
}
/* Entry point of the module. Does nothing unless the TidyMakeBare option
   is set. Otherwise counts the paragraphs below 'node', allocates a
   PARASTR sized from that count, lets Clean2003_SetParas classify the
   paragraphs, lets _s_CleanWord2003 rework the tree, and frees the
   PARASTR. Debug builds also dump the tree before and after. */
void TY_(CleanWord2003)( TidyDocImpl* doc, Node *node)
{
   PPARASTR pps;
   long cnt;

   if ( !cfgBool(doc, TidyMakeBare) )
      return;

#ifndef  NDEBUG
   sprtf( "\nBEFORE CleanWord2003: Calling _test_show ...\n" );
   _test_show( doc, node, 0 );
   sprtf( "Calling Clean2003_CountParas ...\n" );
#endif   /* ifndef NDEBUG */

   /* one slot per paragraph, on top of the slot inside PARASTR itself */
   cnt = Clean2003_CountParas( doc, node );
   pps = (PPARASTR)TidyDocAlloc( doc, sizeof(PARASTR) + (sizeof(int) * cnt) );
   if ( pps == NULL )
      return;

   /* pass 1: mark the array */
   pps->intoffset = 0;
#ifndef  NDEBUG
   sprtf( "Calling Clean2003_SetParas ...\n" );
#endif   /* ifndef NDEBUG */
   Clean2003_SetParas( doc, node, pps );

   /* pass 2: do the 'extra' cleaning, reading the array from the start */
   pps->intoffset = 0;
#ifndef  NDEBUG
   sprtf( "Calling CleanWord2003 ...\n" );
#endif   /* ifndef NDEBUG */
   _s_CleanWord2003( doc, node, pps );

   TidyDocFree( doc, pps );
#ifndef  NDEBUG
   sprtf( "\nAFTER CleanWord2003: Calling _test_show ...\n" );
   _test_show( doc, node, 0 );
#endif   /* ifndef NDEBUG */
}

/* Writes a banner, dumps the whole document tree starting at the
   document's root node with _test_show, then writes a closing marker. */
void show_tree( TidyDocImpl * doc )
{
   Node * top = &doc->root;

   sprtf( "Ready for OUTPUT ...\n" );
   _test_show( doc, top, 0 );
   sprtf( "<debug stop>" );
}

/* eof - clean2003.c */
