/*
 * Caudium - An extensible World Wide Web server
 * Copyright  2000-2001 The Caudium Group
 * Copyright  1994-2001 Roxen Internet Software
 * 
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */
/*
 * $Id: camas_html.pike,v 1.10.2.3.2.3 2001/09/24 11:10:57 kiwi Exp $
 */

#include <module.h>
inherit "module";

inherit "caudiumlib";

//
//! module: CAMAS: HTML Module
//!  Module to render HTML mails into CAMAS.<br />
//!  Used when option "show HTML mails" is enabled. It is used for <i>safe 
//!  html rendering</i> the HTML mails received before displaying them to end
//!  CAMAS users. You can specify there what HTML code is safe or not.<br />
//!  <b>This module is automatically selected if you select "CAMAS: Main 
//!  module".</b>
//! inherits: module
//! inherits: caudiumlib
//! type: MODULE_PROVIDER
//! cvs_version: $Id: camas_html.pike,v 1.10.2.3.2.3 2001/09/24 11:10:57 kiwi Exp $
//

constant cvs_version = "$Id: camas_html.pike,v 1.10.2.3.2.3 2001/09/24 11:10:57 kiwi Exp $";
constant module_type = MODULE_PROVIDER;
constant module_name = "CAMAS: HTML Module";
constant module_doc  = "Module to render HTML mails into CAMAS.<br />"
                       "Used when option \"show HTML mails\" is enabled. It is "
                       "used for <i>safe html rendering</i> the HTML mails "
                       "received before displaying then to end CAMAS users. "
                       "You can specify there what HTML code is safe or not. "
                       "<br />This module is automatically selected if you "
                       "select \"CAMAS: Main module\".</b>";
constant module_unique = 1;
constant thread_safe = 1;		// I think this module should be :)

// --------------------------------------------------------------------------------------

constant html4_containers = ({ 
  "a", "abbr", "acronym", "address", "applet", "area", "b", "base", "bdo", 
  "big", "blockquote", "button", "caption", "center", "cite", "code", 
  "colgroup", "dd", "del", "dfn", "dir", "dl", "dt", "em", "fieldset", 
  "form", "font", "frameset", "h1", "h2", "h3", "h4", "h5", "h6",
  "i", "iframe", "ins", "isindex", "kbd", "label",
  "legend", "li", "map", "menu", "noframes", "noscript", "object",
  "ol", "optgroup", "option", "p", "pre", "q", "s", "samp", "script", "select",
  "small", "strike", "strong", "sub", "sup", "table", "tbody", "td",
  "textarea", "tfoot", "th", "thead", "tr", "tt", "u", "ul", "var"
});

constant html4_tags = ({
  "area", "base", "basefont", "br", "col", "hr", "frame", "img", "input",
  "meta", "param"
});

constant html4_css = ({
  "div", "span", "style", /* not HTML 4.01: */ "layer"
});

constant tag_references = ({ 
  "action", "background", "cite", "classid", "codebase", "data",
  "href", "longdesc", "rel", "src", "usemap"
});

// --------------------------------------------------------------------------------------

mapping (string:mixed) containers = ([ ]), tags = ([ ]);
mapping (string:mixed) containers_inline = ([ ]), tags_inline = ([ ]);
mapping (string:mixed) containers_mailpart = ([ ]), tags_mailpart = ([ ]);

string describe_allowed () {
  return "";
}

string describe_form_allowed (mixed var, mixed path) {
  mixed misc = var[VAR_MISC][3];
  string res = "<select name=\"" + path + "\" multiple size=\"10\">";
  for (int i = 0; i < sizeof (misc); i++) {
    if (var[VAR_VALUE] && has_value (var[VAR_VALUE], misc[i]))
      res += "<option selected>" + misc[i] + "</option>";
    else
      res += "<option>" + misc[i] + "</option>";
  }
  res += "</select>&nbsp;<input type=\"submit\" value=\"Ok\">";
  return res;
}

array set_from_form_allowed (string val, int type, object o, mixed ... rest) {
  return (val / "\0");
}

void create () {
  defvar ("debug", 0, "Debug:General debug", TYPE_FLAG,
	  "When on, <b>general</b> debug messages will be logged in Caudium's debug logfile. "
	  "This information is very useful to the developers when fixing bugs.");

  defvar ("debuglinks", 0, "Debug:Debug links", TYPE_FLAG,
	  "When on, <b>link relocation</b> debug messages will be logged in Caudium's debug logfile. "
	  "This information is very useful to the developers when fixing bugs.");

  defvar ("nojs", "both", "Disallow JavaScript", TYPE_STRING_LIST,
	  "Set this if you do not want JavaScript to pass through. <br />"
	  "It will remove all 'on...' attributes in HTML elements, and the &lt;script&gt; container.",
	  ({ "inline", "mailpart", "both", "no" }));

  defvar ("nocss", "both", "Disallow Cascading Style Sheets", TYPE_STRING_LIST,
	  "Set this if you do not want style sheets to pass through. <br />"
	  "Removed containers:<ul>"
	  "<li>&lt;span&gt;</li>"
	  "<li>&lt;style&gt;</li>"
	  "<li>&lt;div&gt;</li>"
	  "</ul>"
	  "Removed tags:<ul>"
	  "<li>&lt;link&gt;</li>"
	  "</ul>",
	  ({ "inline", "mailpart", "both", "no" }));

  defvar ("allowed_containers", ({ }), "Allowed HTML containers", TYPE_CUSTOM,
	  "The containers allowed to pass through.",
	  // function callbacks for the configuration interface
	  ({ describe_allowed, describe_form_allowed, set_from_form_allowed,
	     // extra argument
	     (html4_containers - ({ "applet", "object", "var" })) }) );

  defvar ("allowed_tags", ({ }), "Allowed HTML tags", TYPE_CUSTOM,
	  "The tags allowed to pass through.",
	  // function callbacks for the configuration interface
	  ({ describe_allowed, describe_form_allowed, set_from_form_allowed,
	     // extra argument
	     (html4_tags - ({ "meta", "param" })) }) );
}

int do_js (int mailpart) {
  int test = (mailpart) ? (!(< "mailpart", "both" >)[QUERY (nojs)]) : (!(< "inline", "both" >)[QUERY (nojs)]);
  if (QUERY (debug))
    write ("do_js => " + test + "\n");
  return test;
}

int do_css (int mailpart) {
  int test = (mailpart) ? (!(< "mailpart", "both" >)[QUERY (nocss)]) : (!(< "inline", "both" >)[QUERY (nocss)]);
  if (QUERY (debug))
    write ("do_css => " + test + "\n");
  return test;
}

mapping _containers = ([ 
  "body": safe_body,
  "html": dump_container,
  "title": safe_ignore_container
]);

void start () {
  containers_inline   = _containers + ([ "head": safe_ignore_container ]);
  containers_mailpart = _containers;

  tags_inline = ([
    "!doctype": safe_ignore_tag,
  ]);

  if (QUERY (allowed_containers)) {
    foreach (QUERY (allowed_containers), string t)
      containers += ([ t: safe_container ]);

    foreach (html4_containers - QUERY (allowed_containers), string t)
      containers += ([ t: safe_ignore_container ]);
  }

  if ((QUERY (nojs) == "inline") || (QUERY (nojs) == "both"))
    containers_inline += ([ "script": safe_ignore_container ]);
  else
    containers_inline += ([ "script": safe_container ]);

  if ((QUERY (nojs) == "mailpart") || (QUERY (nojs) == "both"))
    containers_mailpart += ([ "script": safe_ignore_container ]);
  else
    containers_mailpart += ([ "script": safe_container ]);

  if ((QUERY (nocss) == "inline") || (QUERY (nocss) == "both")) {
    tags_inline += ([ "link": safe_ignore_tag ]);
    foreach (html4_css, string t)
      containers_inline += ([ t: safe_ignore_container ]);
    //containers_inline += ([ t: dump_container ]);
  }
  else {
    tags_inline += ([ "link": dump_tag ]);
    foreach (html4_css, string t)
      //containers_inline += ([ t: dump_container ]);
      containers_inline += ([ t: safe_container ]);
  }

  if ((QUERY (nocss) == "mailpart") || (QUERY (nocss) == "both")) {
    tags_mailpart += ([ "link": safe_ignore_tag ]);
    foreach (html4_css, string t)
      containers_mailpart += ([ t: safe_ignore_container ]);
    //containers_mailpart += ([ t: dump_container ]);
  }
  else {
    tags_mailpart += ([ "link": dump_tag ]);
    foreach (html4_css, string t)
      //containers_mailpart += ([ t: dump_container ]);
      containers_mailpart += ([ t: safe_container ]);
  }

  if (QUERY (allowed_tags)) {
    foreach (QUERY (allowed_tags), string t)
      tags += ([ t: safe_tag ]);

    foreach (html4_tags - QUERY (allowed_tags), string t)
      tags += ([ t: safe_ignore_tag ]);
  }

  if (QUERY (debug)) {
    write(sprintf("tags_inline= %O\n", tags_inline));
    write(sprintf("tags_mailpart= %O\n", tags_mailpart));
    write(sprintf("containers_inline= %O\n", containers_inline));
    write(sprintf("containers_mailpart= %O\n", containers_mailpart));
  }
}

string status () {
  return "CAMAS HTML Module.";
}

string query_provides () {
  return "camas_html";
}

string add_base (mapping m, string ref, string content_base_abs, string content_base_rel) {
  if (!m[ref] || !strlen (m[ref]))
    return "";
  
  if (m[ref][0] == '/') {
    if (QUERY (debuglinks))
      write ("changing link (abs): " + ref + "= " + m[ref] + " => " + content_base_abs + m[ref] + "\n");
    return content_base_abs + m[ref];    
  }
  else {
    if (QUERY (debuglinks))
      write ("changing link (rel): " + ref + "= " + m[ref] + " => " + content_base_rel + m[ref] + "\n");
    return content_base_rel + m[ref];
  }
}

mapping disable_js (mapping m) {
  foreach (indices (m), string att)
    if ((sizeof (m[att]) > 2) && (lower_case (att[0..1]) == "on")) {
      if (QUERY (debug))
	write ("disabling attribute " + att + ": " + m[att] + "\n");
      m_delete (m, att);
    }

  foreach (indices (m), string att)
    if ((sizeof (m[att]) > 10) && (lower_case (m[att][0..10]) == "javascript:")) {
      if (QUERY (debug))
	write ("disabling attribute " + att + ": " + m[att] + "\n");
      m_delete (m, att);
    }

  return m;
}

int is_relative_url (string url) {
  return (sscanf (url, "%*[+-.a-zA-Z0-9]:%*s") != 2);
}

// safe_container and safe_tag used by safe_html to sort out safe tags
string safe_container (string tag, mapping m,
                       string contents, object id, mapping refparts,
		       void|string content_base_abs, void|string content_base_rel, void|int mailpart) {
  foreach (tag_references, string ref) {
    if (m[ref] && (sizeof (m[ref]) > 4) && (lower_case (m[ref][0..3]) == "cid:")) {
      refparts["<" + m[ref][4..] + ">"] = 1;
      m[ref] = id->misc->imho->nextpage + "/image?mailpart=" + http_encode_url (m[ref]);
    } 
    else {
      if (content_base_abs && m[ref] && is_relative_url (m[ref]))
	m[ref] = add_base (m, ref, content_base_abs, content_base_rel);
    }
  }
  
  if (!do_js (mailpart))
    m = disable_js (m);

  string q = make_tag_attributes (m);
  return "{2" + tag + (strlen(q) ? " " + q : "") + "}2" + contents + "{2/" + tag + "}2";
}

string safe_tag (string tag, mapping m, object id, mapping refparts,
		 void|string content_base_abs, void|string content_base_rel, void|int mailpart) {
  foreach (tag_references, string ref) {
    if (m[ref] && (sizeof (m[ref]) > 4) && (lower_case (m[ref][0..3]) == "cid:")) {
      write ("refparts: m[ref]= " + m[ref] + "\n");
      refparts["<" + m[ref][4..] + ">"] = 1;
      m[ref] = id->misc->imho->nextpage + "/image?mailpart=" + http_encode_url (m[ref]);
    }

    if (content_base_abs && m[ref] && is_relative_url (m[ref]))
      m[ref] = add_base (m, ref, content_base_abs, content_base_rel);
  }

  if (!do_js (mailpart))
    m = disable_js (m);
  
  string q = make_tag_attributes (m);

  return "{2" + tag + (strlen(q) ? " " + q : "") + " /}2";
}

string dump_tag (string tag, mapping m, object id, mapping refparts,
		 void|string content_base_abs, void|string content_base_rel, void|int mailpart) {
  string q = make_tag_attributes (m);

  return "{2" + tag + (strlen(q) ? " " + q : "") + " /}2";
}

string dump_container (string tag, mapping m,
		       string contents, object id, mapping refparts,
		       void|string content_base_abs, void|string content_base_rel, void|int mailpart) {
  string q = make_tag_attributes (m);

  return "{2" + tag + (strlen(q) ? " " + q : "") + "}2" + contents + "{2/" + tag + "}2";
}

string safe_ignore_container (string tag, mapping m,
			      string contents, object id, mapping refparts,
			      void|string content_base_abs, void|string content_base_rel, void|int mailpart) {
  if (QUERY (debug))
    write ("safe_ignore_container (" + tag + ")\n");
  return contents;
}

string safe_ignore_tag (string tag, mapping m, object id, mapping refparts,
			void|string content_base_abs, void|string content_base_rel, void|int mailpart) {
  return "";
}

string safe_body (string tag, mapping m,
		  string contents, object id, mapping refparts,
		  void|string content_base_abs, void|string content_base_rel, void|int mailpart) {
  string ret = "";

  foreach (tag_references, string ref) {
    if (m[ref] && (sizeof(m[ref]) > 4) && (m[ref][0..3] == "cid:")) {
      refparts["<" + m[ref][4..] + ">"] = 1;
      m[ref] = id->misc->imho->nextpage + "/image?mailpart=" + http_encode_url (m[ref]);
    }
  }

  if (!do_js (mailpart))
    m = disable_js (m);

  if (mailpart) {
    string q = make_tag_attributes (m);
    ret += "{2body" + (strlen(q) ? " " + q : "") + "}2" + contents + "{2/body}2";
  }
  else {
    ret += "{2table cellpadding=0 border=0 cellspacing=0";
    
    if (do_css (mailpart)) {
      if (m->bgcolor)
	ret += " bgcolor=\"" + m->bgcolor + "\"";
      if (m->text)
	ret += " color=\"" + m->text + "\"";
      if (m->background && content_base_abs) {
	if (m->background[0] == '/')
	  ret += " style=\"background-image: url('" + content_base_abs + m->background + "');\"";
	else
	  ret += " style=\"background-image: url('" + content_base_rel + m->background + "');\"";
      }
    }
    ret += "}2";
    
    ret += "{2tr}2{2td}2" + contents + "{2/td}2{2/tr}2";
    ret += "{2/table}2";
  }

  return ret;
}

// let only allowed tags through and make sure all containers are closed

string safe_html (string in, object id, mapping refparts, void|string content_base, void|int mailpart) {
  string content_base_abs = 0;
  string content_base_rel = 0;

  if (content_base) {
    if (QUERY (debuglinks))
      write ("content_base= " + content_base);
    
    sscanf (content_base, "%*[\"]%s%*[\"]", content_base);
    if (QUERY (debuglinks))
      write ("\n => " + content_base);

    //oliv3: an URL linking to a path/filename is not ok in Content-Base, but Netscape sets one...
    string scheme = "", server = "", scheme_specific_part = "";
    sscanf (content_base, "%[+-.a-zA-Z0-9]:%*[/]%s%*[/]%s", scheme, server, scheme_specific_part);

    array split = explode_path (scheme_specific_part);
    if (sscanf (split[-1], "%*s.%*s") == 2) {
      scheme_specific_part = (split[0..sizeof (split)-2] * "/");
      if (QUERY (debuglinks))
	write ("\n => " + scheme_specific_part);
    }

    if (QUERY (debug))
      write("\nscheme= " + scheme + "\nserver= " + server + "\nscheme_specific_part= " + scheme_specific_part + "\n");

    content_base_abs = scheme + "://" + server;
    content_base_rel = scheme + "://" + server + "/" + scheme_specific_part;

    if (QUERY (debuglinks))
      write ("\n => " + content_base);

    content_base = (content_base / " ") * "";

    if (QUERY(debuglinks)) write ("\n abs= " + content_base_abs + "\n");
    if (QUERY(debuglinks)) write ("rel= " + content_base_rel + "\n");
  }
  
  in = replace (in, ({ "{", "}" }), ({ "{1", "}1" }));

  mapping(string:mixed) tags = (mailpart) ? tags + tags_mailpart : tags + tags_inline;
  mapping(string:mixed) cont = (mailpart) ? containers + containers_mailpart : containers + containers_inline;

  if (id->misc->_tags)
    foreach (indices (id->misc->_tags), string t)
      tags += ([ t: safe_ignore_tag ]);
    
  if (id->misc->_containers)
    foreach (indices (id->misc->_containers), string c)
      containers += ([ c: safe_ignore_container ]);

  string parsed = parse_html (in, tags, containers,
			      id, refparts, content_base_abs, content_base_rel, mailpart);

  parsed = replace (parsed, ({ "{1", "}1", "{2", "}2" }), ({ "{", "}", "<", ">" }));

  return parsed;
}

string add_base_href (string mail, void|string content_base) {
  if (QUERY (debug))
    write ("[HTML] content_base= " + content_base + "\n");

  if (!content_base)
    return mail;

  mapping cont = ([ "head": lambda (object p, mapping m, string c) {
			      string res = "<head><base href=" + content_base + ">";
			      res += c + "</head>";
			      return ({ res });
			    } ]);

  object p = Parser.HTML ();
  p->case_insensitive_tag (1);
  p->ignore_unknown (1);
  mail = p->add_containers (cont)->finish (mail)->read ();
  destruct (p);

  return mail;
}

mapping mailpart (object id) {
  mapping refparts = ([ ]);
  object mail = id->misc->session_variables->cmail;

  if (!mail)
    return http_low_answer (404, "Mailpart not found.");

  int i = 0;
  string mp = id->variables->mailpart;

  if (sscanf (mp, "cid:%s", mp) == 1) {
    mp = "<" + mp + ">";
    array mails = ({ mail });
    while (sizeof (mails) && (mp != mail->headers["content-id"])) {
      mail = mails[0];
      mails = mails[1..];
      if (mail->body_parts)
	mails += mail->body_parts;
    }
    if (mail->headers["content-id"] == mp) {
      string data = mail->getdata () || "";
      string type = mail->type + "/" + mail->subtype;
      if (type == "text/html")
	data = add_base_href (data, mail->headers["content-base"]);
      //write ("mailpart #1\n");
      string res = safe_html (data, id, refparts, 0, 1);
      return http_string_answer (res, type);
    }
    return http_low_answer (404, "Mailpart not found.");
  }
  else {
    sscanf (mp, "%d,%s", i, mp);
    while ((i >= 0) && mail && mail->body_parts && (sizeof (mail->body_parts) > i)) {
      mail = mail->body_parts[i];
      i = -1;
      sscanf (mp, "%d,%s", i, mp);
    }
    string data = mail->getdata () || "";
    string type = mail->type + "/" + mail->subtype;
    //write ("mailpart #2\n");
    if (type == "text/html") {
      data = add_base_href (data, mail->headers["content-base"]);
      string res = safe_html (data, id, refparts, 0, 1);
      return http_string_answer (res, type);
    }
    else
      return http_string_answer (data, type);
  }
}

/* START AUTOGENERATED DEFVAR DOCS */

//! defvar: debug
//! When on, <b>general</b> debug messages will be logged in Caudium's debug logfile. This information is very useful to the developers when fixing bugs.
//!  type: TYPE_FLAG
//!  name: Debug:General debug
//
//! defvar: debuglinks
//! When on, <b>link relocation</b> debug messages will be logged in Caudium's debug logfile. This information is very useful to the developers when fixing bugs.
//!  type: TYPE_FLAG
//!  name: Debug:Debug links
//
//! defvar: nojs
//! Set this if you do not want JavaScript to pass through. <br />It will remove all 'on...' attributes in HTML elements, and the &lt;script&gt; container.
//!  type: TYPE_STRING_LIST
//!  name: Disallow JavaScript
//
//! defvar: nocss
//! Set this if you do not want style sheets to pass through. <br />Removed containers:<ul><li>&lt;span&gt;</li><li>&lt;style&gt;</li><li>&lt;div&gt;</li></ul>Removed tags:<ul><li>&lt;link&gt;</li></ul>
//!  type: TYPE_STRING_LIST
//!  name: Disallow Cascading Style Sheets
//
//! defvar: allowed_containers
//! The containers allowed to pass through.
//!  type: TYPE_CUSTOM
//!  name: Allowed HTML containers
//
//! defvar: allowed_tags
//! The tags allowed to pass through.
//!  type: TYPE_CUSTOM
//!  name: Allowed HTML tags
//
