#!/usr/bin/perl
##############################################################
# tGenerator validation script: checkKeys.pl
# 
# Stephen W. Thomas
# sthomas@cs.queensu.ca
# http://research.cs.queensu.ca/~sthomas/
# Software Analysis and Intelligence Lab (SAIL)
# School of Computing, Queen's University, Canada
#
# Checks the keys for consistency in the .csv files generated by tGenerator.
#
##############################################################
use strict;


# Check command-line arguments: exactly one is expected, the data directory.
if (@ARGV != 1) {
    print "$0: Error: Incorrect usage.\n";
    print "Usage: $0 directory_with_data\n";
    print "Exiting.\n";
    exit 1;
}

# The directory that holds all the .csv files
my $dir = $ARGV[0];

# Read the directory listing from Perl itself rather than shelling out to
# `ls`: the directory name comes straight from the command line, and
# interpolating it into a shell command breaks on names containing spaces or
# shell metacharacters.
opendir(my $dirHandle, $dir) or die("Cant open directory $dir: $!");
my @entries = sort readdir($dirHandle);
closedir($dirHandle);

# Run the duplicate-key check on every plain file named "<prefix>.<anything>",
# one prefix after the other, files of one prefix in sorted order.
for my $prefix ('item_author', 'related_item', 'item_publisher') {
    my @matches = grep { index($_, "$prefix.") == 0 && -f "$dir/$_" } @entries;
    for my $name (@matches) {
        checkFile("$dir/$name");
    }
}

checkFile2("$dir/item.final.csv");
#checkFile("$dir/item_author.final.csv");
#checkFile("$dir/item_publisher.final.csv");
#checkFile("$dir/related_item.final.csv");

# Start/end time sanity check on each of the final tables.
for my $table ('publisher', 'author', 'item', 'related_item', 'item_author', 'item_publisher') {
    checkDates("$dir/$table.final.csv");
}

# Scans one '&'-separated file for repeated keys.
#
# The key of a line is made of its first three '&'-separated fields; any
# further fields are ignored. Scanning stops at the first repeated key, which
# is reported on STDOUT together with the two line numbers involved. When no
# key repeats, a single "looks ok" line is printed instead.
#
# Parameters:
#   $file - path of the file to scan
# Returns:
#   1 when no duplicate key was found, nothing (false) when one was reported.
# Dies:
#   when $file cannot be opened.
sub checkFile {
    my ($file) = @_;

    # Three-argument open on a lexical handle: the file name can never be
    # mistaken for an open mode, and the handle is private to this call.
    open(my $in, '<', $file) or die("Cant open $file: $!");

    my %firstSeenAt;    # {field 1}{field 2}{field 3} => line of first occurrence
    my $lineNo = 0;
    while (my $line = <$in>) {
        ++$lineNo;

        # Strip the line terminator before splitting, so a three-field line
        # produces the same key whether or not it is the unterminated last
        # line of the file.
        chomp $line;

        my ($first, $second, $third) = split(/&/, $line);
        for my $field ($first, $second, $third) {
            # Lines with fewer than three fields: missing fields count as empty.
            $field = '' unless defined $field;
        }

        if (defined $firstSeenAt{$first}{$second}{$third}) {
            print "\nERROR: duplicate entries in $file\n";
            print "$first, $second, $third\n";
            print "lines $firstSeenAt{$first}{$second}{$third} and $lineNo\n";
            close($in);
            return;
        }
        $firstSeenAt{$first}{$second}{$third} = $lineNo;
    }
    close($in);

    print "$file looks ok for duplicate keys\n";
    return 1;
}

# Scans a file whose lines end in "<start>&<end>" for conflicting entries of
# the same id.
#
# For every line the id is the first '&'-separated field, and start/end are
# read from the last 21 characters of the line (line terminator excluded).
# A line is reported when, for the same id, an earlier line already used
# either this line's start value or this line's end value as its start.
# Scanning stops at the first report; otherwise a "looks ok" line is printed.
#
# Parameters:
#   $file - path of the file to scan
# Returns:
#   1 when nothing was reported, nothing (false) otherwise.
# Dies:
#   when $file cannot be opened.
sub checkFile2 {
    my ($file) = @_;

    open(my $in, '<', $file) or die("Cant open $file: $!");

    # Width of the trailing "<start>&<end>" window.
    # Assumes two 10-character timestamps joined by '&' -- TODO confirm
    # against the tGenerator output format.
    my $tailWidth = 21;

    my %seen;    # {id}{start}{end} => 1
    while (my $line = <$in>) {
        # Remove the terminator (LF or CRLF) before measuring the tail, so the
        # window is right even on an unterminated last line.
        $line =~ s/\r?\n\z//;

        my ($iID) = split(/&/, $line);
        my $tail = length($line) > $tailWidth ? substr($line, -$tailWidth) : $line;
        my ($s, $e) = split(/&/, $tail);
        for my $field ($iID, $s, $e) {
            # Malformed/short lines: missing fields count as empty.
            $field = '' unless defined $field;
        }

        # NOTE(review): entries are stored as {id}{start}{end}, yet both
        # lookups below are against the start level. Kept exactly as written;
        # confirm whether an exact (id, start, end) match was intended.
        if (defined $seen{$iID}{$s} or defined $seen{$iID}{$e}) {
            print "\nERROR: duplicate entries in $file\n";
            print "$iID, $s, $e\n";
            close($in);
            return;
        }
        $seen{$iID}{$s}{$e} = 1;
    }
    close($in);

    print "$file looks ok for duplicate keys\n";
    return 1;
}


# Checks that no line of a file has an end time earlier than its start time.
#
# Start and end are read from the last 21 characters of each line (line
# terminator excluded), split on '&'. They are compared as strings with `lt`.
# The first offending line is reported on STDOUT with its line number and the
# scan stops; otherwise a "looks ok" line is printed.
#
# Parameters:
#   $file - path of the file to check
# Returns:
#   1 when nothing was reported, nothing (false) otherwise.
# Dies:
#   when $file cannot be opened.
sub checkDates {
    my ($file) = @_;

    open(my $in, '<', $file) or die("Cant open $file: $!");

    # Width of the trailing "<start>&<end>" window.
    # Assumes two 10-character timestamps joined by '&' whose string order
    # equals their chronological order -- TODO confirm against the
    # tGenerator output format.
    my $tailWidth = 21;

    my $lineNo = 0;
    while (my $line = <$in>) {
        ++$lineNo;

        # Remove the terminator (LF or CRLF) before measuring the tail, so the
        # window is right even on an unterminated last line.
        $line =~ s/\r?\n\z//;

        my $tail = length($line) > $tailWidth ? substr($line, -$tailWidth) : $line;
        my ($s, $e) = split(/&/, $tail);
        for my $field ($s, $e) {
            # Malformed/short lines: missing fields count as empty.
            $field = '' unless defined $field;
        }

        if ($e lt $s) {
            print "\nERROR: end time $e is less than start time $s in file $file\n";
            print "lines $lineNo\n";
            close($in);
            return;
        }
    }
    close($in);

    print "$file looks ok for start and end times\n";
    return 1;
}
