# *********************************************************************
# jointable.awk: joins two NoSQL tables on one or more common fields.
#
# Copyright (c) 1998,2006 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# *********************************************************************
# $Id: jointable.awk,v 1.2 2006/03/10 11:26:13 carlo Exp $

BEGIN {
  NULL = ""; FS = OFS = "\t"; dlm = "\001"

  # Note: former versions of this program used ":" as delimiter in
  # multi-field joins. Unfortunately, fields that sort correctly
  # when taken individually may no longer sort when attached to
  # each other by ":", at least in some circumstances, and thus
  # join(1) no longer works correctly in such cases. It looks
  # like the linking character must sort as low as possible in
  # the ASCII table for things to work, so I have now changed it
  # to octal "\001". That is, it looks like that if the delimiting
  # character sorts higher than the field that follows it, then the
  # attached fields sort differently than their individual values.
  #
  # To see what I mean, try this:
  #
  # LC_ALL=POSIX sort <<EOF
  # 5_1
  # 5_11
  # 5_12
  # EOF
  #
  # and then this:
  #
  # LC_ALL=POSIX sort <<EOF
  # 5_1:1
  # 5_11:1
  # 5_12:1
  # EOF


  # Exit with an error if required parameters are not set.
  # fifo1/fifo2 name the two work files that are later handed to
  # join(1); they are not assigned anywhere in this program, so they
  # are presumably preset by the caller (e.g. with -v) -- TODO confirm.
  if (fifo1 == NULL || fifo2 == NULL) exit(rc=1)

  # Get local settings.
  nosql_install = ENVIRON["NOSQL_INSTALL"]
  stdout = ENVIRON["NOSQL_STDOUT"]
  stderr = ENVIRON["NOSQL_STDERR"]

  # Set default values if necessary.
  if (nosql_install == NULL) nosql_install = "/usr/local/nosql"
  if (stdout == NULL) stdout = "/dev/stdout"
  if (stderr == NULL) stderr = "/dev/stderr"

  if (ENVIRON["NOSQL_DEBUG"] == 1) debug = 1

  # Separate join(1) options from column names.

  while (ARGV[++i] != NULL) {
    # Turn long options into their short form.
    if (ARGV[i] == "-a" || ARGV[i] == "--all") ARGV[i] = "-a1"
    else if (ARGV[i] == "--column") ARGV[i] = "-j"
    else if (ARGV[i] == "--debug") ARGV[i] = "-x"
    else if (ARGV[i] == "--help") ARGV[i] = "-h"
    else if (ARGV[i] == "--numeric") ARGV[i] = "-n"
    else if (ARGV[i] == "--ignore-case") ARGV[i] = "-i"

    # Now process nosql options.

    if (ARGV[i] == "-j") {
       # The argument that follows is a comma-separated column list.
       # jlist is wrapped in commas so that ",name," can be searched
       # for without matching a mere substring of another name.
       jlist = ARGV[++i]
       jn = split(jlist,jc,",")
       jlist = "," jlist ","
    }

    else if (ARGV[i] == "-x") debug = 1
    else if (ARGV[i] == "-h") {
       system("grep -v '^#' " nosql_install "/help/jointable.txt")
       exit(rc=1)
    }
    else if (ARGV[i] == "--show-copying") {
       system("cat " nosql_install "/doc/COPYING")
       exit(rc=1)
    }
    else if (ARGV[i] == "--show-warranty") {
       system("cat " nosql_install "/doc/WARRANTY")
       exit(rc=1)
    }
    else if (ARGV[i] == "-n") {
       # Numeric joins attach the key parts with no delimiter. This is
       # done here, on the short form, so that '-n' and '--numeric'
       # behave the same way.
       dlm = NULL
       join_args = join_args " " ARGV[i]
    }
    else if (ARGV[i] == "-i") join_args = join_args " " ARGV[i]
    else if (ARGV[i] == "-a1") join_args = join_args " " ARGV[i]
    else if (ARGV[i] !~ /^-/ || ARGV[i] == "-") tbl[++j] = ARGV[i]
  }

  ARGC = 1					# Fix argv[]

  # If only one file-name was specified it is used as 'table_2'.

  if (j == 1 && tbl[j] != "-") {
     tbl[++j] = tbl[1]
     tbl[1] = "-"
  }

  if (j != 2) {
     print "usage: jointable [options] table_1 table_2" > stderr
     exit(rc=1)
  }

  # Only one table may be on stdin.
  if (tbl[1] == "-" && tbl[2] == "-") {
     print "jointable: only one table can be on STDIN" > stderr
     exit(rc=1)
  }

  # Neither table is on stdin: table_1 becomes the main awk input
  # and is from now on treated exactly as if it were on stdin.
  if (tbl[1] != "-" && tbl[2] != "-") {
     ARGV[1] = tbl[1]
     ARGC = 2
     tbl[1] = "-"

     # Infer the input key column names from file name, unless they
     # have been explicitly specified on the command line.
     if (!jn) jt_keys_from_name(ARGV[1])
  }

  # Pick the table that is _not_ on stdin (filein). Its rows go to
  # 'fileout', while the rows of the main input go to 'streamout'.
  # table_1 always ends up in fifo1 and table_2 in fifo2.
  if (tbl[1] != "-") {
    filein = tbl[1]
    streamout = fifo2; fileout = fifo1;
  }
  else {
    filein = tbl[2]
    streamout = fifo1; fileout = fifo2;
  }

  # Column names. Fail early if the table cannot be read at all,
  # rather than carrying on with an empty header.
  if ((getline < filein) <= 0) {
     print "jointable: cannot read table " filein > stderr
     exit(rc=1)
  }

  gsub(/ +/,NULL)			# remove blanks from names.
  tmpbuf = "\001_\t" $0			# out fifo not yet created.
  gsub(/\001/,"")			# Remove SOH markers.

  # Get column names and positions.
  # Make sure we pick the first occurrence of duplicated column
  # names (it may happen after a join).

  while (++p <= NF) {
    if (tbl[1] != "-") {
       if (P1[$p] == NULL) P1[$p] = p
       N1[p] = $p
    }
    else {
       if (P2[$p] == NULL) P2[$p] = p
       N2[p] = $p
    }
  }

  # Infer the input key column names from file name, unless they
  # have been explicitly specified on the command line or already
  # inferred from the name of the other table.

  if (!jn) jt_keys_from_name(filein)
}

# Infer the join columns from a table or index file name and store
# them in the globals jn, jc[] and jlist. If nothing can be inferred
# jn and jc[] are left alone and jlist is set to the empty string.
#
# Handle both table and index file names (the index first!). Note
# that _k and _x were chosen because no real column name can begin
# with an underscore, so there's no risk of ambiguities. Note also
# that we need to strip everything up to _x first, as in index
# files the actual key columns are those that come after _x, and
# they may not necessarily be the same as the key columns of the
# main table. That is, given the main table 'table._k.col1.col2',
# it is quite possible to have an index file name like this:
# 'table._k.col1.col2._x.col3.col4.col5'
#
# If the file name has a "-suffix" then it may no longer be a plain
# table, but rather an edit buffer, a muxed file etc. It can then no
# longer be assumed to be sorted on the columns that are listed in
# the file name, so automatic key detection is skipped.
#
# fname: the file name to examine (modified only locally).

function jt_keys_from_name(fname) {
  if (sub(/.*\._x\./,"",fname) || sub(/.*\._k\./,"",fname)) {
     gsub(/\./,",",fname)
     if (!sub(/-.*$/,"",fname) && fname != NULL) {
        jn = split(fname,jc,",")
        jlist = "," fname ","
        return
     }
  }
  jlist = NULL
}

#
# Main loop
#

# Get column names of table that _is_ on stdin.
NR == 1 {
  # Header line of the main input: record its columns, settle the join
  # columns, build the join(1) command and write both header rows.

  gsub(/\001/,"")		# Remove SOH markers
  p = 0
  out_hdr = $0

  # Get column names and positions.
  # Make sure we pick the first occurrence of duplicated column
  # names (it may happen after a join), whichever table this is.

  if (tbl[1] == "-") {
    while (++p <= NF) {
      if (P1[$p] == NULL) P1[$p] = p
      N1[p] = $p
    }
  }
  else {
    while (++p <= NF) {
      if (P2[$p] == NULL) P2[$p] = p
      N2[p] = $p
    }
  }

  # Set default join column (always from table_1) if not specified.

  if (!jn) {
     jc[1] = N1[1]
     jlist = "," N1[1] ","
     jn = 1
  }

  # make sure that the specified join column(s) exists in both tables.

  for (i=1; i<=jn; i++) {
      if (P1[jc[i]] == NULL) {
	 print "jointable: column '" jc[i] "' not found in table " tbl[1] > stderr
	 exit(rc=1)
      }
      if (P2[jc[i]] == NULL) {
	 print "jointable: column '" jc[i] "' not found in table " tbl[2] > stderr
	 exit(rc=1)
      }
  }

  # Build output field list for join(1). Every row written to the
  # work files carries an extra leading key field (see the header
  # rows printed below), hence the "+ 1" on every column position.
  # The join columns come first, taken from table_1.

  field_list = "1." (P1[jc[1]] + 1)

  for (i=2; i<=jn; i++)
	field_list = field_list ",1." (P1[jc[i]] + 1)

  # Then the remaining columns of table_1 followed by those of
  # table_2, skipping the join columns and any column named ".".
  # The name is looked up literally with index(), so that characters
  # which are special in a regular expression cannot cause a false
  # match against the comma-wrapped join list.

  for (c=1; N1[c] != NULL; c++) {
     jre = "," N1[c] ","
     if (N1[c] != "." && index(jlist, jre) == 0)
      field_list = field_list ",1." (P1[N1[c]] + 1)
  }

  for (c=1; N2[c] != NULL; c++) {
     jre = "," N2[c] ","
     if (N2[c] != "." && index(jlist, jre) == 0)
      field_list = field_list ",2." (P2[N2[c]] + 1)
  }

  CMD = "export LC_ALL=POSIX;join -t '\t' " \
	join_args " -o " field_list " " fifo1 " " fifo2

  # Print both table headers as ordinary rows with key "\001_", with
  # the SOH markers put back in front of every column name of the
  # main input header.
  gsub(/\t/,"\t\001", out_hdr)
  print "\001_\t\001" out_hdr > streamout
  print tmpbuf > fileout
}


NR > 1 {
  # Data row of the main input: write it to 'streamout', preceded by
  # its join key (the join column values attached with 'dlm') and a tab.
  # The main input is table_2 when 'filein' is table_1, and vice versa.

  stream_key = NULL
  for (i=1; i<=jn; i++) {
    if (i > 1) stream_key = stream_key dlm
    if (filein == tbl[1]) stream_key = stream_key $P2[jc[i]]
    else stream_key = stream_key $P1[jc[i]]
  }
  printf("%s\t%s\n", stream_key, $0) > streamout

  # Read ahead: take one row from the other table, which replaces $0,
  # and write it to 'fileout' in the same keyed format.

  if ((getline < filein) > 0) {
     file_key = NULL
     for (i=1; i<=jn; i++) {
       if (i > 1) file_key = file_key dlm
       if (filein == tbl[1]) file_key = file_key $P1[jc[i]]
       else file_key = file_key $P2[jc[i]]
     }
     printf("%s\t%s\n", file_key, $0) > fileout
  }
}

END {

  # Runs join(1) on the two work files, unless an error has already
  # been flagged through 'rc'. The exit status of the program is the
  # value returned by system() for the join command.

  if (rc == 0) {

     # print out what is left of the table which is not on stdin.

     while ((getline < filein) > 0) {
       for (i=1; i<=jn; i++) {
	 if (i > 1) printf("%s", dlm) > fileout

	 if (filein == tbl[1]) printf("%s", $P1[jc[i]]) > fileout
	 else printf("%s", $P2[jc[i]]) > fileout
       }
       printf("\t%s\n", $0) > fileout
     }

     # Close the work files before join(1) reads them, so that all the
     # buffered rows are written out regardless of whether system()
     # flushes every open output stream in the awk in use.
     close(filein)
     close(streamout)
     close(fileout)

     if (debug) print CMD > stderr
     exit(rc=system(CMD))
  }

  exit(rc)
}

# End of program
