/* -*- Mode: C; -*- */

/* Copyright (C) beingmeta inc, 2001-2002
   Implemented by Ken Haase as part of FramerD

   This implements optimized repacking of file indices with scheduling
   of file reads to minimize disk accesses.  It also removes duplicate
   values and sorts based on OID id.

   $Id: repack-file-index.c,v 1.13 2002/06/03 22:14:18 haase Exp $ */

#include <framerd/indextools.h>
#include <limits.h>
#ifndef PATH_MAX
#define PATH_MAX 1023
#endif

/* Copies the contents of the file FROM into the file TO.
   If TO is a symbolic link, the data is written into the file named by
   fd_get_real_pathname(TO) rather than into TO itself.
   Raises fd_FileOpenFailed (with FROM) if the source cannot be opened
   and fd_FileOpenWFailed (with TO) if the destination cannot be opened.
   Read, write and close errors during the copy are reported on stderr
   with perror() and end the copy. */
static void copy_binary_file(char *from,char *to)
{
  FILE *in, *out;
  size_t bufsize=65536, n_read;
  char *buf, *realname=NULL;
  /* Open the source first: the destination is opened with "wb", which
     truncates it, so it must not be touched unless there is something
     to copy into it. */
  in=fd_fopen_locked(from,"r+b",0);
  if (in == NULL) {
    if (errno) {perror("Start of binary copy"); FD_CLEAR_ERR();}
    fd_raise_detailed_exception(fd_FileOpenFailed,from);
    return;}
  if (fd_symbolic_linkp(to)) {
    realname=fd_get_real_pathname(to);
    out=fd_fopen_locked(realname,"wb",0);}
  else out=fd_fopen_locked(to,"wb",0);
  if (errno) {perror("Start of binary copy"); FD_CLEAR_ERR();}
  if (out == NULL) {
    /* Release what we hold before signalling the failure. */
    fclose(in);
    if (realname) fd_xfree(realname);
    fd_raise_detailed_exception(fd_FileOpenWFailed,to);
    return;}
  buf=fd_xmalloc(bufsize);
  /* fread returns 0 both at end of file and on a read error, so the
     loop always terminates; the two cases are told apart afterwards. */
  while ((n_read=fread(buf,sizeof(char),bufsize,in)) > 0) {
    if (fwrite(buf,sizeof(char),n_read,out) != n_read) {
      perror("Binary copy (write)"); break;}}
  if (ferror(in)) perror("Binary copy (read)");
  /* Closing flushes buffered output, so it can fail too. */
  if (fclose(out) != 0) perror("Binary copy (close)");
  fclose(in);
  /* buf came from fd_xmalloc, so release it with fd_xfree. */
  fd_xfree(buf);
  if (realname) fd_xfree(realname);
}

/* Prints the command line syntax on stderr and then calls fd_exit(1).
   The option names listed here are the ones tested for in main(). */
static void describe_usage(void)
{
  fprintf(stderr,
  "Usage: repack-file-index [--min min] [--max max] [--bsize bsize] [--size size] <in> [out]\n");
  fd_exit(1);
}

/* Parses ARG as the decimal integer value of the command line option
   OPTION and returns it.  If ARG is missing (NULL) the usage message is
   printed; if it is not a complete decimal integer or does not fit in
   an int, a "Bad arg" message is printed.  Both cases exit with
   status 1. */
static int parse_int_arg(const char *option,const char *arg)
{
  char *end=NULL; long value;
  if (arg == NULL) {
    describe_usage(); exit(1);}
  errno=0; value=strtol(arg,&end,10);
  if ((errno) || (end == arg) || (*end != '\0') ||
      (value > INT_MAX) || (value < INT_MIN)) {
    /* strtol only sets errno for range errors, so supply our own
       description for the other kinds of bad input. */
    fprintf(stderr,"Bad arg to %s (%s) [%s]\n",
            option,arg,((errno) ? (strerror(errno)) : ("not a valid integer")));
    exit(1);}
  return (int)value;
}

/* Entry point for repack-file-index.
   Command line:
     -q             calls fd_disable_notifications()
     -v             accepted and ignored
     --min N        passed as the min argument of fd_read_assocs_from_index
     --max N        passed as the max argument of fd_read_assocs_from_index
     --bsize N      upper bound on the number of values copied per block
                    (--block is accepted as a synonym, as printed by older
                    usage messages)
     --size N|keep  size of the new index; "keep" (or 0) keeps the old size,
                    a negative value (the default) selects a size from the
                    number of keys
     <in> [out]     input index and optional output file
   When no output file is given, or it has the same name as the input, the
   new index is written to a temporary file and then copied back over
   the input file.  Returns 0 on success and exits with status 1 on
   errors. */
int main(int argc,char *argv[])
{
  struct FD_ASSOC *assocs;
  time_t make, repack, change;
  fd_lisp metadata; int major_version, minor_version;
  int i=0, n_keys, min=0, max=-1, bsize_max=4*65536;
  int r_off, need_copy=0, new_size=-1, old_size, write_failed=0;
  char *infile=NULL, *outfile=NULL, tmpbuf[PATH_MAX]="/tmp/fdrpiXXXXXX";
  char *filebase;
  FILE *in, *out;
  if (argc < 2) describe_usage();
  /* Note that argv[argc] is NULL, so argv[i+1] may always be read;
     parse_int_arg() rejects a missing (NULL) option value. */
  i=1; while (i < argc) {
    char *arg=argv[i];
    if (arg[0] == '-') {
      if (strcmp(arg,"-q") == 0) {
        fd_disable_notifications(); i++;}
      else if (strcmp(arg,"-v") == 0) i++;
      else if (strcmp(arg,"--max") == 0) {
        max=parse_int_arg(arg,argv[i+1]); i=i+2;}
      else if (strcmp(arg,"--min") == 0) {
        min=parse_int_arg(arg,argv[i+1]); i=i+2;}
      else if ((strcmp(arg,"--bsize") == 0) ||
               (strcmp(arg,"--block") == 0)) {
        bsize_max=parse_int_arg(arg,argv[i+1]); i=i+2;}
      else if (strcmp(arg,"--size") == 0) {
        /* The keyword is the option's value, i.e. the next argument */
        if ((argv[i+1] != NULL) && (strcmp(argv[i+1],"keep") == 0))
          new_size=0;
        else new_size=parse_int_arg(arg,argv[i+1]);
        i=i+2;}
      else {
        describe_usage();
        exit(1);}}
    else if (outfile) {
      describe_usage(); exit(1);}
    else if (infile) outfile=argv[i++];
    else infile=argv[i++];}
  /* Only options were given: there is nothing to repack */
  if (infile == NULL) {
    describe_usage(); exit(1);}
  fd_initialize_framerd();
  filebase=fd_basename(infile,1);
  /* r+b is necessary here */
  in=fd_fopen_locked(infile,"r+b",0);
  if (in == NULL) {
    fd_fprintf(stderr,"Error: file %s does not exist\n",infile);
    fd_exit(1); return 1;}
  if ((outfile) && (strcmp(outfile,infile) != 0))
    out=fd_fopen_locked(outfile,"wb",0);
  else {out=fd_fopen_tmpfile(tmpbuf,"wb"); need_copy=1;}
  if (out == NULL) {
    fd_fprintf(stderr,"Error: can't open output file %s\n",
               ((need_copy) ? (tmpbuf) : (outfile)));
    fclose(in); fd_exit(1); return 1;}
  if (fd_fread_4bytes(in) != FD_FILE_INDEX_MAGIC_NUMBER) {
    fprintf(stderr,"The file %s is not a file index!\n",infile);
    fclose(in); fclose(out);
    /* Don't leave the (empty) temporary output file behind */
    if (need_copy) remove(tmpbuf);
    exit(1);}
  else old_size=fd_fread_4bytes(in);
  metadata=fd_read_file_index_metadata
    (in,&major_version,&minor_version,&make,&repack,&change);
  fd_notify("Repacking file index %s, version %d:%d",
            infile,major_version,minor_version);
  assocs=fd_read_assocs_from_index(in,&n_keys,&r_off,0,min,max,filebase);
  fd_sort_assocs_by_n_values(assocs,n_keys);
  /* Pick the size of the new index */
  if (new_size < 0) {
    /* No size was requested: keep the old size when there are no keys
       or fewer keys than a tenth of the old size, otherwise select a
       table size for three times the number of keys. */
    if (n_keys == 0) new_size=old_size;
    else if (n_keys < old_size/10) new_size=old_size;
    else new_size=fd_select_table_size(n_keys*3);}
  else if (new_size == 0) /* Means keep the old size */
    new_size=old_size;
  else if (new_size < n_keys) /* Requested size is smaller than the key count */
    new_size=fd_select_table_size(n_keys*2);
  else {} /* Use the specified size */

  /* Copy the values in blocks */
  {
    int start=0, pos, first_pos, new_slots=
      fd_start_file_index(out,new_size,metadata,major_version+1,
                          ((make<0) ? (0) : make),((change < 0) ? (0) : (change)));
    int n_vals=0, vals_copied=0;
    /* Compute the total number of values */
    int k=0; while (k < n_keys) n_vals=n_vals+assocs[k++].n_values;
    fd_notify("Copying %d values over %d keys",n_vals,n_keys);
    /* The pos value is offset by new_slots*4, and we use
       ftell because metadata of arbitrary length may have
       been written by fd_start_file_index(). */
    pos=ftell(out)-new_slots*4; first_pos=pos;
    while (start < n_keys) {
      char sbuf[32];
      int finish=start+1, bsize=assocs[start].n_values, npos;
      /* Extend the block while it stays under bsize_max values */
      while ((finish < n_keys) &&
             (bsize+assocs[finish].n_values < bsize_max)) {
        bsize=bsize+assocs[finish].n_values; finish=finish+1;}
      npos=fd_copy_assoc_values(assocs+start,finish-start,in,out,pos,r_off);
      vals_copied=vals_copied+bsize;
      /* Avoid dividing by zero when none of the keys has any values */
      sprintf(sbuf,"%.2f%%",
              ((n_vals > 0) ? ((vals_copied*100.0)/n_vals) : (100.0)));
      /* First the increment for this block, then the running totals */
      fd_notify("%s: %s: copied +%d/+%d/+%d keys/values/bytes",
                filebase,sbuf,finish-start,bsize,(npos-pos));
      fd_notify("%s: %s: total  %d/%d/%d keys/values/bytes",
                filebase,sbuf,finish,vals_copied,(npos-first_pos));
      start=finish; pos=npos;}
    /* Write out the keys */
    fd_write_keys_to_index(out,assocs,n_keys,new_slots,pos,filebase);}
  /* ftell returns a long, which must be narrowed to match %d */
  fd_notify("New file index has version info %d:%d",
            major_version+1,(int)ftell(out));
  /* Close the file streams, noticing any failure to write the output
     before it can be copied over the original index. */
  fclose(in);
  if (ferror(out)) write_failed=1;
  if (fclose(out) != 0) write_failed=1;
  if (write_failed) {
    fprintf(stderr,"Error writing the repacked index for %s\n",infile);
    fd_exit(1); return 1;}
  if (need_copy) {
    fd_notify(_("Copying output file back onto original"));
    copy_binary_file(tmpbuf,infile);}
  fd_exit(0);
  return 0;
}


/* File specific stuff */

/* The CVS log for this file
   $Log: repack-file-index.c,v $
   Revision 1.13  2002/06/03 22:14:18  haase
   Shortened lines of progress reports

   Revision 1.12  2002/06/03 21:51:21  haase
   Progress reports now provide more context

   Revision 1.11  2002/04/22 14:23:08  haase
   Added extended metadata to file pools and indices

   Revision 1.10  2002/04/12 15:42:32  haase
   Repacking a linked file now copies into the target rather than overwriting the link

   Revision 1.9  2002/04/11 19:42:13  haase
   Fixed bug where repack-file-index copied its results back into argv[0] --- typically 'repack-file-index' --- rather than the input file

   Revision 1.8  2002/04/10 03:02:10  haase
   Added version information to file pools and indices

   Revision 1.7  2002/04/03 01:33:09  haase
   Moved indextools out of FD_SOURCE core

   Revision 1.6  2002/04/02 21:39:32  haase
   Added log and emacs init entries to C source files

*/

/* Emacs local variables
;;;  Local variables: ***
;;;  compile-command: "cd ../..; make" ***
;;;  End: ***
*/
