
/******************************************************************************
* MODULE     : parsehtml.gen.cc
* DESCRIPTION: conversion of logical html trees into edit trees
* COPYRIGHT  : (C) 2000  Joris van der Hoeven
*******************************************************************************
* This software falls under the GNU general public license and comes WITHOUT
* ANY WARRANTY WHATSOEVER. See the file $TEXMACS_PATH/LICENSE for more details.
* If you don't have this file, write to the Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
******************************************************************************/

#include <analyze.gen.h>
#include <vars.gen.h>

#module code_fromhtml
#import analyze
#import vars

// Forward declarations of routines which are used in this file before (or
// without) being defined here.
// html_args_to_tree translates the children t[start], t[start+1], ... of t.
tree html_args_to_tree (tree t, int start= 1);
// Same as html_args_to_tree, but with trivial results simplified.
tree var_html_args_to_tree (tree t);
// Translation of a single node of the parsed html tree.
tree html_to_tree (tree t);
// Concatenation of two trees into a single CONCAT.
tree merge_concats (tree t1, tree t2);
// Applied to the final result of the conversion; no definition is visible
// in this file.
tree upgrade_html (tree t);

/******************************************************************************
* General purpose subroutines
******************************************************************************/

// Look up the attribute ATTR among the children t[1], t[2], ... of T.
// Every child is converted to a string; a child matches when its first
// N(attr)+1 characters, once upcased, are equal to ATTR followed by "=".
// Returns the text after the "=" of the first match, or "" if none matches.
string
html_get_attribute (tree t, string attr) {
  string key= attr * "=";
  int prefix_len= N(attr)+1;
  int k= 1;
  while (k < arity (t)) {
    string entry= as_string (t[k]);
    if (N(entry) >= prefix_len) {
      if (upcase_all (entry (0, prefix_len)) == key)
        return entry (prefix_len, N(entry));
    }
    k++;
  }
  return "";
}

// Same as html_get_attribute, except that one pair of enclosing double
// quotes (character '\042') is stripped from the value when present.
// Values which are not enclosed by double quotes are returned unchanged.
string
html_get_text_attribute (tree t, string attr) {
  string val= html_get_attribute (t, attr);
  int len= N(val);
  bool quoted= (len >= 2) && (val[0] == '\042') && (val[len-1] == '\042');
  if (quoted) return val (1, len-1);
  return val;
}

/******************************************************************************
* Specific translations
******************************************************************************/

// Translate a heading: the translated children of T become the argument of
// an EXPAND of the macro SECTION, which is put between two new lines.
tree
html_to_tree_section (tree t, string section) {
  tree heading (EXPAND, section, var_html_args_to_tree (t));
  tree out (CONCAT);
  out << tree (FORMAT, NEW_LINE);
  out << heading;
  out << tree (FORMAT, NEW_LINE);
  return out;
}

// Translate the children of T and enclose them between a SET of the
// variable VAR to the value WHAT and the matching RESET of VAR.
// The result is always a CONCAT.
tree
html_to_tree_set (tree t, string var, tree what) {
  tree out (CONCAT);
  out << tree (SET, copy (var), copy (what));
  out << A (html_args_to_tree (t));
  out << tree (RESET, copy (var));
  return out;
}

// Translate the children of T and enclose them between BEGIN and END tags
// for the environment ENV. A new line is emitted in front of the
// environment when F1 holds and behind it when F2 holds.
tree
html_to_tree_begin (tree t, string env, bool f1=FALSE, bool f2=FALSE) {
  tree out (CONCAT);
  if (f1) out << tree (FORMAT, NEW_LINE);
  out << tree (BEGIN, copy (env));
  tree body= html_args_to_tree (t);
  out << A (body);
  out << tree (END, copy (env));
  if (f2) out << tree (FORMAT, NEW_LINE);
  return out;
}

// Emit an EXPAND of the macro WHAT (without arguments) followed by the
// translated children of T. A new line is emitted in front of the result
// when F1 holds and behind it when F2 holds.
tree
html_to_tree_apply (tree t, string what, bool f1=FALSE, bool f2=FALSE) {
  tree out (CONCAT);
  if (f1) out << tree (FORMAT, NEW_LINE);
  out << tree (EXPAND, copy (what));
  tree body= html_args_to_tree (t);
  out << A (body);
  if (f2) out << tree (FORMAT, NEW_LINE);
  return out;
}

// Translate a FIG or IMG element into a POSTSCRIPT tree whose first argument
// is the value of the SRC attribute. Returns "" when T is atomic or empty,
// when the tag is neither FIG nor IMG, or when there is no SRC attribute.
//
// A tag with attributes has a compound head t[0] whose own first child is
// the tag name and whose further children are the attributes. In that case
// we descend into the head, so that the attribute lookup below scans the
// attributes themselves.
tree
html_to_tree_image (tree t) {
  if (is_atomic (t) || (arity (t) == 0)) return "";
  // ->label is only meaningful for atomic trees, so a compound head has to
  // be handled before the tag name is read.
  if (is_compound (t[0])) return html_to_tree_image (t[0]);
  string s= upcase_all (t[0]->label);
  if ((s != "FIG") && (s != "IMG")) return "";
  string src= html_get_text_attribute (t, "SRC");
  if (src == "") return "";
  tree r (POSTSCRIPT, src);
  // The remaining six arguments are two identical scaling expressions
  // followed by four empty strings.
  r << "*6383/10000" << "*6383/10000" << "" << "" << "" << "";
  return r;
}

// Translate a FONT element. The children of T are translated first. When the
// tag carries a SIZE attribute, the contents are enclosed between a SET and
// a RESET of FONT_SIZE; relative sizes -4 ... +4 map to fixed multipliers
// and any other value maps to "1". When the tag carries a COLOR attribute,
// the result is additionally enclosed between a SET and a RESET of COLOR.
// The result is "" when it is empty and the single child when there is
// exactly one; otherwise it is a CONCAT.
tree
html_to_tree_font (tree t) {
  tree r= html_args_to_tree (t);
  tree head= t[0];
  // Attributes can only be present when the head of the element is compound.
  // A FONT tag without attributes must still yield its translated contents,
  // so we only skip the attribute handling instead of returning "".
  if (is_compound (head)) {
    // Use the quote stripping accessor, as is done for COLOR, so that both
    // SIZE=+1 and SIZE="+1" are recognized.
    string sz= html_get_text_attribute (head, "SIZE");
    if (sz != "") {
      string mult= "1";
      if (sz == "-4") mult= "0.5";
      if (sz == "-3") mult= "0.6";
      if (sz == "-2") mult= "0.7";
      if (sz == "-1") mult= "0.8";
      if (sz == "+1") mult= "1.2";
      if (sz == "+2") mult= "1.4";
      if (sz == "+3") mult= "1.7";
      if (sz == "+4") mult= "2";
      tree u= tree (CONCAT, tree (SET, FONT_SIZE, copy (mult)));
      u << A(r) << tree (RESET, FONT_SIZE);
      r= u;
    }
    string col= html_get_text_attribute (head, "COLOR");
    if (col != "") {
      tree u= tree (CONCAT, tree (SET, COLOR, copy (col)));
      u << A(r) << tree (RESET, COLOR);
      r= u;
    }
  }
  if (N(r)==0) return "";
  if (N(r)==1) return r[0];
  return r;
}

// Translate an A element. A NAME attribute yields a LABEL. When T has at
// least two children, its translated contents follow, wrapped into a
// HYPERLINK if there is a non-empty HREF attribute. The result is "" when
// nothing was produced and the single item when there is exactly one.
tree
html_to_tree_link (tree t) {
  tree head= t[0];
  tree out (CONCAT);
  string anchor= html_get_text_attribute (head, "NAME");
  if (anchor != "") out << tree (LABEL, anchor);
  if (arity (t) >= 2) {
    tree body= html_args_to_tree (t);
    string target= html_get_text_attribute (head, "HREF");
    if (target != "") out << tree (HYPERLINK, body, target);
    else out << body;
  }
  if (N(out) == 1) return out[0];
  if (N(out) == 0) return "";
  return out;
}

/******************************************************************************
* Main translation
******************************************************************************/

// Translate the children t[start], ..., t[arity(t)-1] of T and collect the
// results in one flat CONCAT: translations which are themselves CONCATs are
// spliced in, all other translations are appended as a single child.
tree
html_args_to_tree (tree t, int start) { // Must return a CONCAT
  tree out (CONCAT);
  int total= arity (t);
  int k= start;
  while (k < total) {
    tree piece= html_to_tree (t[k]);
    if (!is_concat (piece)) out << piece;
    else out << A (piece);
    k++;
  }
  return out;
}

// Variant of html_args_to_tree with simplification of the result:
// an empty CONCAT becomes "" and a CONCAT with one child becomes that child.
tree
var_html_args_to_tree (tree t) {
  tree args= html_args_to_tree (t);
  int count= N(args);
  if (count == 1) return args[0];
  if (count == 0) return "";
  return args;
}

// Translate one node T of the parsed html tree into an edit tree.
// Atomic nodes (text) are copied. For compound nodes the tag name is read
// from t[0], or from t[0][0] when the head t[0] is itself compound (which
// is where the attribute accessors in this file look for attributes), and
// is upcased before dispatching on it. Unknown tags are translated into a
// BEGIN/END pair for an environment named after the upcased tag.
tree
html_to_tree (tree t) {
  // cout << "+++ t= " << t << "\n";
  if (is_atomic (t)) return copy (t);
  string s= upcase_all (is_compound (t[0])? t[0][0]->label: t[0]->label);
  // cout << "+++ s= " << s << "\n";

  // Document structure: the header is dropped, containers are transparent
  if (s == "!DOCTYPE") return "";
  if (s == "HEAD") return "";
  if (s == "HTML") return var_html_args_to_tree (t);
  if (s == "BODY") return var_html_args_to_tree (t);

  // Text body
  if (s == "BANNER") return var_html_args_to_tree (t);
  if (s == "BQ") return html_to_tree_begin (t, "quotation", TRUE, TRUE);
  if (s == "BLOCKQUOTE") return html_to_tree_begin (t,"quotation",TRUE,TRUE);
  if (s == "CRTREE") {
    // Small capitals between parentheses, with a leading space
    tree r= html_to_tree_set (t, TEXT_SHAPE, "small-caps");
    return merge_concats (tree (" ("), merge_concats (r, tree (")")));
  }
  if (s == "DIV") return var_html_args_to_tree (t);
  if (s == "FIG") return html_to_tree_image (t);
  if (s == "FN") return html_to_tree_begin (t, "footnote", TRUE, TRUE);
  if (s == "NOTE") return html_to_tree_begin (t, "note", TRUE, TRUE);
  if (s == "H1") return html_to_tree_section (t, "chapter");
  if (s == "H2") return html_to_tree_section (t, "section");
  if (s == "H3") return html_to_tree_section (t, "subsection");
  if (s == "H4") return html_to_tree_section (t, "subsubsection");
  if (s == "H5") return html_to_tree_section (t, "paragraph");
  if (s == "H6") return html_to_tree_section (t, "subparagraph");
  if (s == "HR") {
    tree r (CONCAT);
    r << tree (FORMAT, NEW_LINE);
    r << tree (VALUE, "hrule");
    r << tree (FORMAT, NEW_LINE);
    return r;
  }
  if (s == "DL") return html_to_tree_begin (t, "description", TRUE, TRUE);
  if (s == "DT") {
    tree r (CONCAT);
    r << tree (FORMAT, NEW_LINE);
    r << tree (EXPAND, "item*", var_html_args_to_tree (t));
    return r;
  }
  if (s == "DD") return var_html_args_to_tree (t);
  if (s == "OL") return html_to_tree_begin (t, "enumerate", TRUE, TRUE);
  if (s == "UL") return html_to_tree_begin (t, "itemize", TRUE, TRUE);
  if (s == "LI") return html_to_tree_apply (t, "item", TRUE, TRUE);
  if (s == "MENU") return html_to_tree_begin (t, "itemize", TRUE, TRUE);
  if (s == "DIR") return html_to_tree_begin (t, "itemize", TRUE, TRUE);
  if (s == "P") {
    // A paragraph is its contents followed by a new line
    tree r= html_args_to_tree (t);
    r << tree (FORMAT, NEW_LINE);
    return r;
  }
  if (s == "PRE") return html_to_tree_begin (t, "verbatim");
  if (s == "CENTER") return html_to_tree_begin (t, "center");

  // Physical Phrase Markup
  if (s == "FONT") return html_to_tree_font (t);
  if (s == "B") return html_to_tree_set (t, TEXT_SERIES, "bold");
  if (s == "BIG") return html_to_tree_set (t, FONT_SIZE, "1.2");
  if (s == "I") return html_to_tree_set (t, TEXT_SHAPE, "italic");
  if (s == "S") return var_html_args_to_tree (t);
  if (s == "SMALL") return html_to_tree_set (t, FONT_SIZE, "0.83");
  if (s == "SUB") return tree (RIGHT_SUB, var_html_args_to_tree (t));
  if (s == "SUP") return tree (RIGHT_SUP, var_html_args_to_tree (t));
  if (s == "TT") return html_to_tree_set (t, TEXT_FAMILY, "tt");
  if (s == "U") return var_html_args_to_tree (t);

  // Semantic Phrase Markup
  if (s == "ACRONYM") return var_html_args_to_tree (t);
  if (s == "ABBREV") return var_html_args_to_tree (t);
  if (s == "AU") return html_to_tree_set (t, TEXT_SHAPE, "small-caps");
  if (s == "CITE") return html_to_tree_set (t, TEXT_SHAPE, "small-caps");
  if (s == "CODE") return html_to_tree_begin (t, "verbatim");
  if (s == "DEL") return "";
  if (s == "DFN") return html_to_tree_set (t, TEXT_SHAPE, "italic");
  if (s == "EM") return html_to_tree_set (t, TEXT_SHAPE, "italic");
  if (s == "INS") return var_html_args_to_tree (t);
  if (s == "KBD") return html_to_tree_set (t, TEXT_FAMILY, "tt");
  if (s == "LANG") return var_html_args_to_tree (t);
  if (s == "PERSON") return html_to_tree_set (t, TEXT_SHAPE, "small-caps");
  if (s == "Q") {
    // Quotations are put between `` and ''
    tree r (CONCAT, "``");
    r << A (html_args_to_tree (t));
    r << "''";
    return r;
  }
  if (s == "SAMP") return html_to_tree_set (t, TEXT_FAMILY, "tt");
  if (s == "STRONG") return html_to_tree_set (t, TEXT_SERIES, "bold");
  if (s == "VAR") return html_to_tree_set (t, TEXT_SHAPE, "italic");

  // Character Level and Special Elements
  if (s == "BR") return tree (FORMAT, NEW_LINE);
  if (s == "IMG") return html_to_tree_image (t);
  if (s == "A") return html_to_tree_link (t);

  // Unknown tag: environment named after the upcased tag
  return html_to_tree_begin (t, s);
}

/******************************************************************************
* Useful subroutines
******************************************************************************/

// Return a CONCAT made of the children of T1 followed by those of T2.
// An argument which is not a CONCAT counts as a CONCAT with that argument
// as its only child. When T1 is a CONCAT, the children of T2 are appended
// to T1 itself and T1 is returned.
tree
merge_concats (tree t1, tree t2) {
  tree head= is_concat (t1)? t1: tree (CONCAT, t1);
  tree tail= is_concat (t2)? t2: tree (CONCAT, t2);
  head << A (tail);
  return head;
}

// Extract the children t[j], ..., t[k-1] of T as a CONCAT, with the usual
// simplification: no child yields "" and a single child yields that child.
tree
sub_concat (tree t, int j, int k) {
  int len= k-j;
  tree part (CONCAT, len);
  int pos= 0;
  while (pos < len) {
    part[pos]= t[j+pos];
    pos++;
  }
  if (N(part)==1) return part[0];
  if (N(part)==0) return "";
  return part;
}

// Test whether T is one of the markup tags BEGIN, END, SET, RESET or ASSIGN,
// which are skipped when spaces are stripped at the borders of a line.
bool
is_invisible (tree t) {
  return is_func (t, BEGIN) || is_func (t, END) ||
         is_func (t, SET) || is_func (t, RESET) ||
         is_func (t, ASSIGN);
}

// Determine how T changes the verbatim state: 1 when verbatim is entered,
// -1 when it is left and 0 when there is no change. T may be the verbatim
// BEGIN or END tag itself, or a CONCAT; in a CONCAT the last such tag among
// the direct children decides.
int
get_verbatim_change (tree t) {
  tree opening (BEGIN, "verbatim");
  tree closing (END, "verbatim");
  if (t == opening) return 1;
  if (t == closing) return -1;
  if (!is_concat (t)) return 0;
  int k= N(t);
  while (k-- > 0) {
    if (t[k] == opening) return 1;
    if (t[k] == closing) return -1;
  }
  return 0;
}

/******************************************************************************
* Remove double spaces
******************************************************************************/

// Copy S, leaving out every character at whose position the two character
// string "  " starts; a run of spaces is thereby reduced to its last space.
string
remove_double_spaces (string s) {
  string out;
  int pos, len= N(s);
  for (pos=0; pos<len; pos++) {
    if (test (s, pos, "  ")) continue;
    out << s[pos];
  }
  return out;
}

// Return S without its leading spaces.
string
remove_starting_spaces (string s) {
  int len= N(s), first= 0;
  for (; first < len; first++)
    if (s[first] != ' ') break;
  return s (first, len);
}

// Return S without its trailing spaces.
string
remove_ending_spaces (string s) {
  int end= N(s);
  for (; end > 0; end--)
    if (s[end-1] != ' ') break;
  return s (0, end);
}

// Apply the string version of tm_encode to every atomic leaf of T and
// rebuild the compound nodes around the encoded leaves.
static tree
tm_encode (tree t) {
  if (is_atomic (t)) return tree (tm_encode (t->label));
  int k, n= N(t);
  tree out (t, n);
  for (k=0; k<n; k++) out[k]= tm_encode (t[k]);
  return out;
}

// Clean up the spaces in T. Atomic trees get their double spaces removed.
// For the lines of a DOCUMENT, spaces at the start and the end of each line
// are stripped as well, except for lines inside a verbatim environment,
// which are only passed through tm_encode. For other compound trees, the
// children are processed recursively and, inside a CONCAT, a string which
// directly follows an "item" or "item*" EXPAND loses its leading spaces.
tree
finalize_spaces (tree t) {
  if (is_atomic (t)) return remove_double_spaces (t->label);
  else {
    int i, j, n= N(t);
    // NOTE(review): presumably a tree with the same label as t and n
    // children; the children are all assigned in the loop below
    tree r (t, n);
    int verbatim_mode= FALSE, verbatim_change;

    for (i=0; i<n; i++) {
      if (is_document (r)) {
	// cout << "r[" << i << "]= " << r[i] << "\n";
	verbatim_change= get_verbatim_change (t[i]);
	// Lines which enter or leave verbatim are treated like normal lines
	if ((verbatim_change!=0) || (!verbatim_mode)) {
	  r[i]= finalize_spaces (t[i]);
	  if (is_atomic (r[i])) {
	    r[i]= remove_starting_spaces (r[i]->label);
	    r[i]= remove_ending_spaces (r[i]->label);
	  }
	  if (is_concat (r[i])) {
	    // Strip leading spaces: skip invisible markup, stop at the first
	    // other compound child or at the first string which remains
	    // non empty after stripping
	    for (j=0; j<N(r[i]); j++) {
	      if (is_invisible (r[i][j])) continue;
	      if (is_compound (r[i][j])) break;
	      r[i][j]= remove_starting_spaces (r[i][j]->label);
	      if (r[i][j] != "") break;
	    }
	    // Same treatment for trailing spaces, scanning from the end
	    for (j=N(r[i])-1; j>=0; j--) {
	      if (is_invisible (r[i][j])) continue;
	      if (is_compound (r[i][j])) break;
	      r[i][j]= remove_ending_spaces (r[i][j]->label);
	      if (r[i][j] != "") break;
	    }
	    r[i]= simplify_concat (r[i]);
	  }
	}
	// Inside verbatim: keep the spaces and only encode the text
	else r[i]= tm_encode (t[i]);
	// The state change takes effect from the next line on
	if (verbatim_change == 1) verbatim_mode= TRUE;
	if (verbatim_change == -1) verbatim_mode= FALSE;
      }

      else {
	r[i]= finalize_spaces (t[i]);
	if (is_concat (r)) {
	  // No spaces directly behind the expansion of "item" or "item*"
	  if ((i>0) && is_atomic (r[i]) && is_func (r[i-1], EXPAND) &&
	      ((r[i-1][0] == "item") || (r[i-1][0] == "item*")))
	    r[i]= remove_starting_spaces (r[i]->label);
	}
      }
    }
    
    if (is_concat (r)) return simplify_concat (r);
    return r;
  }
}

/******************************************************************************
* Remove superfluous returns (merge lines which belong together)
******************************************************************************/

// Test whether the line T may be appended to the line in front of it:
// this holds for the empty string, for END, RESET and ASSIGN tags, and for
// CONCATs all of whose children satisfy the same test.
bool
mergeable_with_before (tree t) {
  if (is_atomic (t)) return (t == "");
  if (is_func (t, END) || is_func (t, RESET) || is_func (t, ASSIGN))
    return TRUE;
  if (!is_concat (t)) return FALSE;
  int k= 0, n= N(t);
  while ((k < n) && mergeable_with_before (t[k])) k++;
  return k == n;
}

// Test whether the line T may absorb the line behind it: this holds for
// the empty string, for BEGIN, SET and ASSIGN tags, for CONCATs all of
// whose children satisfy the same test, and for the expansion of "item"
// or "item*".
bool
mergeable_with_behind (tree t) {
  if (is_atomic (t)) return (t == "");
  if (is_func (t, BEGIN) || is_func (t, SET) || is_func (t, ASSIGN))
    return TRUE;
  if (is_concat (t)) {
    int k= 0, n= N(t);
    while ((k < n) && mergeable_with_behind (t[k])) k++;
    return k == n;
  }
  return is_func (t, EXPAND) && ((t[0] == "item") || (t[0] == "item*"));
}

// Build a DOCUMENT from the lines of T. Empty lines are dropped, except
// inside verbatim, where they become a line with a single space. A non empty
// line is merged into the previous line of the result when it is mergeable
// with what comes before, or when the previous line is mergeable with what
// comes behind. The result always has at least one line.
tree
finalize_returns (tree t) {
  tree doc (DOCUMENT);
  int in_verbatim= FALSE;
  int k, n= arity (t);

  for (k=0; k<n; k++) {
    tree line= t[k];
    int change= get_verbatim_change (line);

    if (line == "") {
      if (in_verbatim && (change == 0)) doc << " ";
    }
    else {
      int last= N(doc)-1;
      if ((last >= 0) &&
          (mergeable_with_before (line) || mergeable_with_behind (doc[last])))
        doc[last]= merge_concats (doc[last], line);
      else doc << line;
    }

    // The verbatim state is updated after the line has been handled
    if (change == 1) in_verbatim= TRUE;
    else if (change == -1) in_verbatim= FALSE;
  }

  if (N(doc) == 0) doc << "";
  return doc;
}

/******************************************************************************
* Finalize_document
******************************************************************************/

// Move the verbatim tags at the borders of the lines of T to the lines
// where the verbatim text is:
//   - an END tag at the start of a line goes to the end of the previous line;
//   - a line which consists of a BEGIN tag only is merged into the next
//     line and replaced by "";
//   - a BEGIN tag at the end of a line goes to the start of the next line.
// The children of T are modified in place and T is returned.
tree
finalize_verbatim (tree t) {
  int i, n= arity (t);
  tree begin_verbatim (BEGIN, "verbatim");
  tree end_verbatim (END, "verbatim");
  for (i=0; i<n; i++) {
    // Empty CONCATs have no first or last child to inspect
    if ((i>0) && is_concat (t[i]) && (N(t[i])>0) &&
        (t[i][0] == end_verbatim)) {
      t[i-1]= merge_concats (t[i-1], t[i][0]);
      t[i]  = sub_concat (t[i], 1, N(t[i]));
    }
    // A BEGIN tag on the last line has no next line to be merged into
    if ((i<n-1) && (t[i] == begin_verbatim)) {
      t[i+1]= merge_concats (t[i], t[i+1]);
      t[i]  = "";
    }
    // t[i] may have been replaced above, so it is tested again
    if ((i<n-1) && is_concat (t[i]) && (N(t[i])>0) &&
        (t[i][N(t[i])-1] == begin_verbatim)) {
      t[i+1]= merge_concats (t[i][N(t[i])-1], t[i+1]);
      t[i]  = sub_concat (t[i], 0, N(t[i])-1);
    }
  }
  return t;
}

// Cut the flat sequence of children of T into the lines of a DOCUMENT at
// each new line tag. Successive strings on a line are glued together into
// one string. A line without items becomes "" and a line with one item
// becomes that item. The document is then passed through finalize_spaces,
// finalize_returns and finalize_verbatim, in this order.
tree
finalize_document (tree t) {
  tree doc (DOCUMENT);
  tree line (CONCAT);
  int k, n= arity (t);

  for (k=0; k<n; k++) {
    tree item= t[k];
    if (item == tree (FORMAT, NEW_LINE)) {
      // Close the current line and start a new one
      if (N(line)==0) line= "";
      else if (N(line)==1) line= line[0];
      doc << line;
      line= tree (CONCAT);
      continue;
    }
    int last= N(line)-1;
    bool glue= is_atomic (item) && (last >= 0) && is_atomic (line[last]);
    if (glue) line[last]->label << item->label;
    else line << item;
  }

  // Whatever follows the last new line tag forms the last line
  if (N(line)==0) line= "";
  else if (N(line)==1) line= line[0];
  doc << line;

  tree spaced  = finalize_spaces (doc);
  tree returned= finalize_returns (spaced);
  return finalize_verbatim (returned);
}

/******************************************************************************
* Interface
******************************************************************************/

// Convert the html source S into an edit tree: S is parsed, all top level
// nodes of the result (starting at index 0) are translated, the result is
// finalized as a document and passed to upgrade_html. MODE is not used.
tree
html_to_tree (string s, string mode) {
  (void) mode;
  tree parsed= parse_html (s);
  tree body  = html_args_to_tree (parsed, 0);
  tree doc   = finalize_document (body);
  return upgrade_html (doc);
}

// Convert the html document S into an edit tree, using mode "text", and
// store "browser" in STYLE.
tree
html_document_to_tree (string s, string& style) {
  tree doc= html_to_tree (s, "text");
  style= "browser";
  return doc;
}

#endmodule // code_fromhtml
