(************************************************************
 *
 * A part of Regexp/OCaml module.
 * 
 * (c) 2002-2003 Yutaka Oiwa. All Rights Reserved.
 *
 * This file is distributed under the terms of the Q Public
 * License version 1.0.
 *
 ************************************************************)
(* $Id: parse_regexp.ml 90 2004-09-02 19:14:58Z yutaka $ *)

(* received hints from Jerome Vouillon's re library *)
open Printf

(* data type for regular expression *)

exception Parse_error of int * int * string

type regexp_tree = 
    Char of char
  | Seq of regexp_tree list
  | Repeat of bool * regexp_tree
  | Repeat1 of bool * regexp_tree
  | RepeatN of bool * int option * int option * regexp_tree
  | Optional of bool * regexp_tree
  | TaggedGroup of int ref * regexp_tree
  | UntaggedGroup of regexp_tree
  | Branch of regexp_tree list
  | PredefinedSet of char
  | Meta of char
  | Set of bool * set_element list
  | Empty

and set_element = 
    S_Char of char
  | S_Range of char * char
  | S_PredefinedSet of char

type regexp_options = 
    { caseless : bool }

type regexp = 
    regexp_options * int * regexp_tree

type parse_info = 
    { str : string;
      len : int;
      mutable pos : int;
      mutable pi_caseless : bool;
      mutable pi_extended : bool;
    }

let make_info s = 
  { str = s; len = String.length s; pos = 0;
    pi_caseless = false; pi_extended = false }

let escapable_characters = 
  " !\"#$%&'()*+,-./:;<=>?[\\]`^_{|}~"

let meta_characters = 
  "*+?()\\|{[^$."

let octal_characters = "01234567"
let decimal_characters = "0123456789"
let hexadecimal_characters = "0123456789abcdefABCDEF"

let space_characters = " \t\r\n"

let charset_list = 
  [ 'd', [S_Range ('0', '9')];
    'D', [S_Range ('\000', '/'); S_Range (':', '\255')];
    's', [S_Char ' '; S_Char '\012'; S_Char '\n'; S_Char '\r'; S_Char '\t'];
    'S', [S_Range ('\000', '\008'); S_Char '\011'; S_Range ('\014', '\031'); S_Range ('!', '\255')];
    'w', [S_Range ('0', '9'); S_Range ('A', 'Z'); S_Range('a', 'z'); S_Char '_'];
    'W', [S_Range ('\000', '/'); S_Range (':', '@'); S_Range ('[', '^');
	  S_Char '`'; S_Range ('{', '\255')]
  ]

let char_escape_list =
  [ 'a', '\007';
    'e', '\027';
    'f', '\012';
    'r', '\013';
    'n', '\010';
    't', '\009';
  ]

let error_at_last_char ~info ?(nchars = 1) str = 
  raise (Parse_error(info.pos - nchars, info.pos, str))
    
let error_at_here ~info str = 
  raise (Parse_error(info.pos, info.pos + 1, str))

let error_at_eos ~info str = 
  raise (Parse_error(info.len - 1, info.len, str))
    
let advance ~info = 
  info.pos <- info.pos + 1

let advance_n ~info n = 
  info.pos <- info.pos + n

let rec eat_comment ~info = 
  if not info.pi_extended then () else
  if info.pos >= info.len then () else
  match info.str.[info.pos] with
    '#' ->
      while(info.pos < info.len && info.str.[info.pos] <> '\n') do
	advance ~info
      done;
      eat_comment ~info
  | c when String.contains space_characters c ->
      advance ~info;
      eat_comment ~info
  | _ -> ()

let eos_no_comment ~info = 
  info.pos >= info.len
    
let get_no_comment ~info = 
  if eos_no_comment ~info then 
    error_at_eos ~info "end of regexp not expected"
  else
    let c = info.str.[info.pos] in
    advance ~info; c
	
let lookahead_no_comment ~info = 
  if eos_no_comment ~info then None
  else Some (info.str.[info.pos])

let eos ~info = 
  eat_comment ~info;
  info.pos >= info.len

let lookahead ~info = 
  if eos ~info then None
  else Some (info.str.[info.pos])

let lookahead_n ~info n = 
  if eos ~info then None
  else if info.pos + n - 1 >= info.len then None
  else Some (String.sub info.str info.pos n)

let accept ~info c = 
  if lookahead ~info = Some c then
    (advance ~info; true)
  else
    false

let accept_str ~info s = 
  let len = String.length s in
  if lookahead_n ~info len = Some s then
    (advance_n ~info len; true)
  else
    false

let get ~info = 
  match lookahead ~info with
    Some c -> 
      advance ~info; c
  | None ->
      error_at_eos ~info "end of regexp not expected"
	
let eat ~info c = 
  if get ~info <> c then
    error_at_last_char ~info (sprintf "'%c' expected" c)
  else ()

let parse_number ~info ~err = 
  let s = Buffer.create 8 in
  let rec iter () = 
    match lookahead ~info with
      Some ('0' .. '9') ->
	let c = get ~info in
	Buffer.add_char s c; iter ()
    | _ ->
	()
  in
  iter ();
  let s = Buffer.contents s in
  let l = String.length s in
  if l <= 0 then error_at_here ~info ("digit" ^ err ^ " expected")
  else if 9 < l then error_at_last_char ~info ~nchars:l ("number too large")
  else int_of_string s

let parse_repeat_number ~info = 
  let s = 
    if lookahead ~info = Some ',' 
    then None
    else Some (parse_number ~info ~err:" or , or }") in
  let e =
    if lookahead ~info = Some '}' then
      s
    else begin
      eat ~info ',';
      if lookahead ~info = Some '}' 
      then None
      else Some(parse_number ~info ~err:" or }")
    end
  in
  (match s, e with
    None, None -> error_at_here ~info "empty range {,}"
  | Some s, Some e -> if s > e then error_at_here ~info "invalid range (start > end)"
  | _ -> ());
  s, e

let rec parse_whole ~info = 
  if lookahead_n ~info 3 <> Some "(?:" then
    if accept_str ~info "(?" then
      while (not (accept ~info ')')) do
	match get ~info with
	  'i' -> info.pi_caseless <- true
	| 'x' -> info.pi_extended <- true
	| _ -> error_at_last_char ~info "unknown option"
      done;
  parse_toplevel ~info

and parse_toplevel ~info = 
  let l = ref [] in
  let e = parse_branch ~info in
  while (accept ~info '|') do
    l := parse_branch ~info :: !l;
  done;
  if !l = [] then e else Branch(e :: List.rev !l)

and parse_branch ~info = 
  let o = ref [] in
  while (not (eos ~info) 
	   && not (lookahead ~info = Some '|')
	   && not (lookahead ~info = Some ')')
	) do
    let r = ref (parse_simple ~info) in
    if (accept ~info '*') then
      r := Repeat(not (accept ~info '?'), !r)
    else if (accept ~info '+') then
      r := Repeat1(not (accept ~info '?'), !r)
    else if (accept ~info '?') then
      r := Optional(not (accept ~info '?'), !r)
    else if (accept ~info '{') then
      (let s, e = parse_repeat_number ~info in
      eat ~info '}';
      r := RepeatN(not (accept ~info '?'), s, e, !r));
    o := !r :: !o
  done;
  match List.length !o with
    0 -> Empty
  | 1 -> List.hd (!o)
  | _ -> Seq(List.rev !o)

and parse_simple ~info = 
  if accept ~info '^' then
    Meta '^'
  else if accept ~info '$' then
    Meta '$'
  else if accept ~info '.' then
    Meta '.'
  else if accept ~info '(' then begin
    let g = 
      if accept ~info '?' then begin
	eat ~info ':';
	UntaggedGroup(parse_toplevel ~info);
      end else
	TaggedGroup(ref (-1), parse_toplevel ~info)
    in
    eat ~info ')';
    g
  end
  else if accept ~info '[' then
    let r = parse_charset ~info in
    eat ~info ']';
    r
  else if accept ~info '\\' then 
    parse_backslash ~info
  else
    let c = get ~info in
    if not (String.contains meta_characters c)
    then
      Char c
    else
      error_at_last_char ~info "invalid metacharacter here"

and parse_escaped_charcode ~info prefix set set_name num prevlen =
  let s = String.make num '0' in
  for i = 0 to num - 1 do
    let c = get_no_comment ~info in
    if (not (String.contains set c)) then
      error_at_last_char ~info ~nchars:1
	(set_name ^ " character expected");
    s.[i] <- c;
  done;
  try
    Char.chr (int_of_string (prefix ^ s))
  with 
    Invalid_argument _ ->
      error_at_last_char ~info ~nchars:(num + prevlen) "bad character code"

and parse_backslash ~info = 
  let c = get_no_comment ~info in
  if String.contains escapable_characters c then
    Char c
  else if List.mem_assoc c char_escape_list then
    Char (List.assoc c char_escape_list)
  else if List.mem_assoc c charset_list then
    PredefinedSet c
  else if c = 'x' then
    Char (parse_escaped_charcode ~info
	    "0x" hexadecimal_characters "hexadecimal" 2 2)
  else if c = 'o' then
    Char (parse_escaped_charcode ~info
	    "0o" octal_characters "octal" 3 2)
  else if String.contains decimal_characters c then begin
    advance_n (-1) ~info;
    Char (parse_escaped_charcode ~info
	    "" decimal_characters "decimal" 3 1)
  end
  else
    error_at_last_char ~info "bad character after \\"

and parse_charset ~info = 
  let invert = accept ~info '^' in
  let l = ref [] in
  while(!l = [] || lookahead ~info <> Some ']') do
    let elem = 
      let elem1 = 
	if accept ~info '\\' then
	  match parse_backslash ~info with
	    Char c -> S_Char c
	  | PredefinedSet c -> S_PredefinedSet c
	  | _ -> assert false
	else
	  S_Char (get ~info)
      in
      match elem1 with
	S_PredefinedSet _ -> [elem1]
      |	S_Char c1 ->
	  if accept ~info '-' then
	    if (lookahead ~info = Some ']') then
	    (* final char *)
	      [S_Char '-'; elem1]
	    else 
	      let c2 = 
		if accept ~info '\\' then
		  match parse_backslash ~info with
		    Char c2 -> c2
		  | PredefinedSet _ -> 
		      error_at_last_char ~info "charset escape not allowed here"
		  | _ -> assert false
		else
		  get ~info
	      in
	      if c1 = c2 then
		[ elem1 ]
	      else if c1 > c2 then
		error_at_last_char ~info "bad char range (first > last)"
	      else
		[ S_Range(c1, c2) ]
	  else
	    [ S_Char c1 ]
      |	_ -> assert false
    in
    l := elem @ !l
  done;
  Set(invert, List.rev !l)
    
let rec put_tag cur = function
    Char _ | Meta _ | Set _ | PredefinedSet _ | Empty -> cur
  | Seq l | Branch l -> 
      List.fold_left 
	(fun cur re -> put_tag cur re) cur l
  | Repeat(_,t) | Repeat1(_,t) | Optional(_,t) | RepeatN(_,_,_,t)
  | UntaggedGroup(t) -> put_tag cur t
  | TaggedGroup(ir,t) ->
      ir := cur; put_tag (cur + 1) t

let put_tag r = put_tag 0 r

let parse_regexp s = 
  let info = make_info s in
  let s = parse_whole ~info in
  if (not (eos ~info)) then 
    error_at_here ~info "parse finished prematurely: why?";
  let cnt = put_tag s in
  let options = { caseless = info.pi_caseless } in
  (options, cnt, s)

type backref_kind = 
    BR_Normal
  | BR_Optional
  | BR_List

let make_backref_table (_, num, t) = 
  let table = Array.make num BR_List in (* just paranoia default *)
  let rec iter kind = function
      Char _ | Meta _ | Set _ | PredefinedSet _ | Empty -> ()
    | Seq l ->
	List.iter (iter kind) l
    | Branch l -> 
	List.iter (iter (if kind = BR_Normal then BR_Optional else kind)) l
    | RepeatN(_,Some 1,Some 1,t) ->
	iter kind t
    | Optional(_,t) | RepeatN(_,_,Some (0|1),t) (* {,0} {,1} {0,0} {0,1} *)
      -> iter (if kind = BR_Normal then BR_Optional else kind) t
    | Repeat(_,t) | Repeat1(_,t) | RepeatN(_,_,_,t)
      -> iter BR_List t
    | UntaggedGroup(t) -> iter kind t
    | TaggedGroup(ir,t) ->
	table.(!ir) <- kind; iter kind t
  in
  iter BR_Normal t;
  table

(* level : 
   0: printing toplevel
   1: printing branch ('|' forbidden)
   2: printing ('*?' | seq forbidden) *)

let pcre_put_escaped_char buf metas c = 
  match c with
    ' ' .. '~' ->
    if String.contains metas c then
      Buffer.add_char buf '\\';
    Buffer.add_char buf c
  | '\n' -> 
      Buffer.add_string buf "\\n"
  | _ ->
      bprintf buf "\\x%02x" (Char.code c)
	

let rec pcre_string_of_regexp buf level = 
  function
      Char c ->
	pcre_put_escaped_char buf meta_characters c
    | Seq s ->
	if level > 1 then raise Exit;
   	List.iter (pcre_string_of_regexp buf 2) s
    | Repeat(greedy, r) ->
	if level > 2 then raise Exit;
	pcre_string_of_regexp buf 3 r;
	Buffer.add_char buf '*';
	if (not greedy) then Buffer.add_char buf '?'
    | Repeat1(greedy, r) ->
	if level > 2 then raise Exit;
	pcre_string_of_regexp buf 3 r;
	Buffer.add_char buf '+';
	if (not greedy) then Buffer.add_char buf '?'
    | RepeatN(greedy, s, e, r) ->
	if level > 2 then raise Exit;
	pcre_string_of_regexp buf 3 r;
	Buffer.add_char buf '{';
	(match s with
	  Some s -> Buffer.add_string buf (string_of_int s);
	| None -> ());
	if s <> e then begin
	  Buffer.add_char buf ',';
	  match e with
	    Some e -> Buffer.add_string buf (string_of_int e);
	  | None -> ()
	end;
	Buffer.add_char buf '}';
	if (not greedy) then Buffer.add_char buf '?'
    | Optional(greedy, r) ->
	if level > 2 then raise Exit;
	pcre_string_of_regexp buf 3 r;
	Buffer.add_char buf '?';
	if (not greedy) then Buffer.add_char buf '?'
    | TaggedGroup(_, t) ->
	Buffer.add_char buf  '(';
	pcre_string_of_regexp buf 0 t;
	Buffer.add_char buf ')'
    | UntaggedGroup t ->
	Buffer.add_string buf "(?:";
	pcre_string_of_regexp buf 0 t;
	Buffer.add_char buf ')'
    | Branch [] ->
	raise Exit;
    | Branch (h::t) ->
	if level > 0 then raise Exit;
	pcre_string_of_regexp buf 1 h;
	List.iter 
	  (fun r -> Buffer.add_char buf '|';
	    pcre_string_of_regexp buf 1 r) t
    | PredefinedSet(c) ->
	if List.mem_assoc c charset_list then
	  (Buffer.add_char buf '\\'; Buffer.add_char buf c)
	else
	  raise Exit
    | Meta(c) -> begin
	match c with
	  '^' | '$' | '.' -> Buffer.add_char buf c
	| _ -> raise Exit
    end
    | Set(invert, l) ->
	Buffer.add_char buf '[';
	if invert then Buffer.add_char buf '^' else ();
	List.iter 
	  (function
	      S_Char c ->
		pcre_put_escaped_char buf "\\[]-" c
	    | S_Range(f,t) ->
		pcre_put_escaped_char buf "\\[]-" f;
		Buffer.add_char buf '-';
		pcre_put_escaped_char buf "\\[]-" t;
	    | S_PredefinedSet(c) ->
		if List.mem_assoc c charset_list then
		  (Buffer.add_char buf '\\'; Buffer.add_char buf c)
		else
		  raise Exit) 
	  l;
	Buffer.add_char buf ']'
    | Empty -> ()

let pcre_string_of_regexp (options,_,re) =
  let buf = Buffer.create 16 in
  if options.caseless then Buffer.add_string buf "(?i)";
  pcre_string_of_regexp buf 0 re;
  Buffer.contents buf

