
//
// dftime.cc  time efficient version
// C++ program that finds frequent itemsets - file dftime.cc
// September 26, 2003
// November 17, 2003: changed  unsigned short count  into  int count
// Wim Pijls and Walter Kosters
// Erasmus University Rotterdam and Leiden University, The Netherlands
// pijls@few.eur.nl and kosters@liacs.nl
// http://www.liacs.nl/home/kosters/df/
// 
// makefile: g++ -Wall -O3 -o fim_all dftime.cc
//
// Assume that the relevant database and the trie containing all frequent
// sets together fit in main memory; relevant here means: all transactions
// that contain at least 2 frequent items (the so-called relevant
// transactions), whereas only frequent items are considered. Every byte 
// contains 8 bits of the database. So its size is the number of 
// frequent items times the number of relevant transactions divided by 8.
//
// The data is read four times:
// 1. to find minimal and maximal item number that occurs
// 2. to determine the frequencies of the single item numbers
// 3. to find the relevant transactions (containing at least 2 frequent items)
// 4. to store the (relevant) database in main memory
// 

// name inputfile:
#define input_filename argv[1]
// (absolute) value of minsup:
#define macro_minsup atoi(argv[2])
// name outputfile:
#define output_filename argv[3]

#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <ctime>
using namespace std;

typedef char *transaction;
const int MAXDEPTH = 100; // maximal depth of trie (for printing)

int minsup;               // minimum support
int therow;               // number of currect transaction
transaction globpointer;  // current transaction
transaction *dataset;     // the whole database
char vector[8];           // masks for fast
char *supervector;        //   array addressing

class initialcounts
{
  public:
    initialcounts (char *inputdata);
    static int *itemsorder;
    static int *items_frequency;
    static short *ranking;
    static int min_itemnr, max_itemnr;
    static int number_transactions, number_freq_items, lines;
  private:
    void first_pass ( );
    void second_pass ( );
    void third_pass ( );
    void initialsort ( );
    char *infilename;
    int *init_items_frequency;
};

class data_array
{
  public:
    data_array (char *inputdata);
  private:
    char *next_transaction;
    void inputread (char *inputdata);
};

struct bucket
{
  short itemvalue;
  int count;  // used to be  unsigned short count;
  short number_followers;
  struct bucket *next;
};

void count (bucket *trienode, short number_buckets);

// does currect transaction (globpointer) contain item "column"?
inline int inspect (int column)
{
  return ( ( globpointer[column >> 3] & supervector[column] ) );
}//inspect

class trie
{
  public:
    trie ( ) { };
    trie (initialcounts & initialdata, data_array & datagrid);
    void build_up ( );
    void printtrie (char *outputdata);
  private:
    int length_count[MAXDEPTH];
    void extend (int k);
    void cleanup (bucket * & trienode, short & number);
    FILE *outfilename;
    struct bucket *root;
    int triesize;
    int results[MAXDEPTH];
    void copying (struct bucket *p, struct bucket *q, int number_q_buckets);
    void printout (int depth, struct bucket *trienode, int number_buckets);
};

int *initialcounts::items_frequency = NULL;
short *initialcounts::ranking = NULL;
int *initialcounts::itemsorder = NULL;
int initialcounts::number_transactions = 0;
int initialcounts::lines = 0;
int initialcounts::number_freq_items = 0;
int initialcounts::min_itemnr = 0;
int initialcounts::max_itemnr = 0;

// constructor
initialcounts::initialcounts (char *inputdata)
{
  infilename = inputdata;
  first_pass ( );
  second_pass ( );
  third_pass ( );
  initialsort ( );
}//initialcounts::initialcounts

// computes minimal and maximal item number that occur in the database;
// if these are known in advance, this function can be easily adapted
// function reads whole file!
void initialcounts::first_pass ( )
{
  int itemnr;
  bool first = true;
  ifstream fin (infilename);
  if ( ! fin ) 
    cout << "No such filename" << endl;
  char c; 
  int pos;
  do {
    do {
      fin.get (c);
      itemnr = 0;
      pos = 0;
      while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) )
      {
        itemnr = 10*itemnr + (int)(c) - (int)('0');
        pos++;
        fin.get (c);
      }//while
      if ( pos ) {
	if ( first )
          max_itemnr = min_itemnr = itemnr;
	first = false;
        if ( itemnr < min_itemnr )
          min_itemnr = itemnr;
        else if ( itemnr > max_itemnr )
          max_itemnr = itemnr;
      }//if
    } while ( c != '\n' && ! fin.eof ( ) );
  } while ( ! fin.eof ( ) );
  fin.close ( );
}//initialcounts::first_pass

// determines frequency for all items, and number of frequent items
// function reads whole file!
void initialcounts::second_pass ( )
{
  int k;
  ifstream fin (infilename);
  if ( ! fin ) 
    cout << "No such filename" << endl;
  int itemrange = max_itemnr-min_itemnr+1;
  init_items_frequency = new int[itemrange];
  for ( k = 0; k < itemrange; k++ ) 
    init_items_frequency[k] = 0;
  char c; 
  int item, pos;
  do {
    do	{
      fin.get (c);
      item = 0;
      pos = 0;
      while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) )
      {
        item = 10*item + (int)(c) - (int)('0');
        pos++;
        fin.get (c);
      }//while
      if ( pos )
        init_items_frequency[item-min_itemnr]++;
    } while ( c != '\n' && ! fin.eof ( ) );
  } while ( ! fin.eof ( ) );
  fin.close ( );
  for ( k = 0; k < itemrange; k++ )
    if ( init_items_frequency[k] >= minsup )
      number_freq_items++;
//  printf ("Number of frequent items: %d\n", number_freq_items);
}//initialcounts::second_pass

// determines number of relevant transactions, i.e., those
// that contain at least two frequent items
// (those that contain one frequent item have already been accounted for
// while determining the frequency of the single items)
// function reads whole file!
void initialcounts::third_pass ( )
{
  number_transactions = 0;
  ifstream fin (infilename);
  if ( ! fin) 
    cout << "No such filename" << endl;
  char c;
  int item, pos, items_in_trans;
  bool line;
  do {  
    line = false;
    items_in_trans = 0;
    do {
      fin.get (c);
      item=0;
      pos=0;
      while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) )
      {
        item = 10*item + (int)(c) - (int)('0');
        pos++;
        line = true;
        fin.get (c);
      }//while
      if ( pos && init_items_frequency[item-min_itemnr] >= minsup )
        items_in_trans++;
    } while ( c != '\n' && ! fin.eof ( ) );
    if ( line ) 
      lines++;
    if ( items_in_trans >= 2 ) 
      number_transactions++;
  } while ( ! fin.eof ( ) );
//  printf ("Number of relevant transactions: %d\n", number_transactions);
//  printf ("Number of lines: %d\n", lines);
  fin.close ( );
}//initialcounts::third_pass

// sort items with respect to support - and renumber
void initialcounts::initialsort ( )
{
  int left_cursor = 0;
  int right_cursor = max_itemnr-min_itemnr;
  int itemrange = right_cursor+1;
  int *items_numbers;
  int i, j, k;
  items_numbers = new int[number_freq_items];
  for ( k = 0; k < number_freq_items; k++ )
    items_numbers[k] = k;
  while ( left_cursor < right_cursor )
  {
    while ( init_items_frequency[left_cursor] >= minsup 
	   && left_cursor<number_freq_items ) 
      left_cursor++;
    while ( init_items_frequency[right_cursor] < minsup ) 
      right_cursor--;
    if ( left_cursor < right_cursor )
    {
      init_items_frequency[left_cursor] = init_items_frequency[right_cursor];
      items_numbers[left_cursor] = right_cursor;
    }//if
    right_cursor--;
  }//while
  ranking = new short[itemrange];
  for ( k = 0; k < itemrange; k++ ) 
    ranking[k] = -1;
  itemsorder = new int[number_freq_items];
  items_frequency = new int[number_freq_items];
  int maxfrequency, bestitem = 0, bestindex = 0;
  int *used;
  used = new int[number_freq_items];
  for ( i = 0; i < number_freq_items; i++ ) 
    used[i] = 0;
  for ( int rank = number_freq_items-1; rank >= 0; rank-- )
  {
    maxfrequency = -1;
    for ( j = 0; j < number_freq_items; j++ )
      if ( used[j] == 0 && init_items_frequency[j] > maxfrequency )
      { 
	bestindex = j;
        bestitem = items_numbers[j];
        maxfrequency = init_items_frequency[j];
      }//if
    itemsorder[rank] = bestitem+min_itemnr;
    ranking[bestitem] = rank;
    items_frequency[rank] = maxfrequency;
    used[bestindex] = 1;
  }//for
}//initialcounts::initialsort

// constructor
data_array::data_array (char *inputdata)
{
  int v = 1;
  vector[0] = (char) v;
  for ( int k = 1; k < 8; k++ )
  {
    v = 2*v;
    vector[k] = (char) v;
  }//for
  supervector = new char[initialcounts::number_freq_items];
  for ( int col = 0; col < initialcounts::number_freq_items; col++ )
    supervector[col] = vector[col & 7];
  dataset = new transaction[initialcounts::number_transactions];
  inputread (inputdata);
}//data_array::data_array

// reads the entire datafile from file inputdata and
// puts in into a 2-dimensional character array (eight 0/1's per byte)
// reads whole file (for the fourth time)!
void data_array::inputread (char *inputdata)
{  
  int pos, item, items_count;
  short newitemnr;
  char c;
  ifstream fin (inputdata);
  if ( ! fin ) 
    cout << "No such filename" << endl; 
  int arraywidth = (int)((initialcounts::number_freq_items-1)/8) + 1;
  int rowcounter = 0;
  do {
    next_transaction = new char[arraywidth];
    for ( int column = 0; column < arraywidth; column++ ) 
      next_transaction[column] = '\000';
    items_count = 0;
    do {
      fin.get (c);
      item = 0;
      pos = 0;
      while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) )
      {
        item = 10*item + (int)(c) -(int)('0');
        pos++;
        fin.get (c);
      }//while
      if ( pos && initialcounts::ranking[item-initialcounts::min_itemnr] >= 0 )
      {	
	newitemnr = initialcounts::ranking[item-initialcounts::min_itemnr];
	next_transaction[newitemnr >> 3] |= supervector[newitemnr];
        items_count++;
      }//if
    } while ( c != '\n' && ! fin.eof ( ) );
    if ( items_count >= 2 )   
      dataset[rowcounter++] = next_transaction;
    else 
      delete [ ] next_transaction;
  } while( ! fin.eof ( ) );
  fin.close ( );
}//data_array::inputread

// constructor
trie::trie (initialcounts & initialdata, data_array & datagrid)
{
  triesize = initialcounts::number_freq_items;
  root = (bucket*) calloc (triesize, sizeof (bucket));
  for ( int itemnr = 0; itemnr < triesize; itemnr++ )
  { 
    root[itemnr].count = minsup;  // to avoid problems with short's
    root[itemnr].itemvalue = itemnr;
    root[itemnr].number_followers = 0;
    root[itemnr].next = NULL;
  }//for
}//trie::trie

// build trie out of transactions
void trie::build_up ( )
{
  int upperbound = initialcounts::number_freq_items-2;
  bucket *root2;
  short numberfol;
  for ( int k = upperbound; k >= 0; k-- )
  {
    extend (k);
    root2 = root[k].next;
    numberfol = root[k].number_followers;
    for ( therow = 0; therow < initialcounts::number_transactions; therow++ ) 
    {
      globpointer = dataset[therow];
      if ( inspect (k) )
	count (root2, numberfol);
    }//for
    if ( numberfol > 0 )
      cleanup (root[k].next, root[k].number_followers);
  }//for
}//trie::build_up

// extend trie at position k
void trie::extend (int k)
{
  root[k].next = (bucket*) calloc (triesize-k-1, sizeof (bucket));
  root[k].number_followers = triesize-k-1;
  copying (root[k].next, root+k+1, triesize-1-k);
}//trie::extend

// push contents of buckets to the beginning of the array,
// leaving number of them (call by reference parameter!)
void trie::cleanup (bucket * & trienode, short & number)
{
  short i, j = 0; 
  for ( i = 0; i < number; i++ ) {
    if ( trienode[i].count >= minsup ) {
      trienode[j] = trienode[i];
      if ( trienode[j].number_followers > 0 )
        cleanup (trienode[j].next, trienode[j].number_followers);
      j++;
    }//if
  }//for
  // memory usage optimizer:
//  trienode = (bucket*) realloc (trienode, j*sizeof (bucket));
  number = j;  // call by reference parameter!
}//trie::cleanup

// copy trie structure from q to p
void trie::copying (struct bucket *p, struct bucket *q, int number_q_buckets)
{
  short temp;
  for ( int source = 0; source < number_q_buckets; source++ )
  {
    p->count = 0;
    p->itemvalue = q->itemvalue;
    p->number_followers = temp = q->number_followers;
    if ( temp > 0 )
    {
      p->next = (bucket*) calloc (temp, sizeof (bucket));
      copying (p->next, q->next, temp);
    }//if
    else
      p->next = NULL;
    p++;  // pointer arithmetic
    q++;
  }//for
}//trie::copying 

// do the counting: process current transaction recursively through trienode
void count (bucket *trienode, short number_buckets)
{ 
  for ( short i = 0; i < number_buckets; i++ )
  {
    if ( inspect (trienode->itemvalue) )
    {
      trienode->count++;
      if ( trienode->number_followers > 0 )
        count (trienode->next, trienode->number_followers);
    }//if
    trienode++;  // pointer arithmetic
  }//for
}//count

// print resulting trie and frequency of each pattern length
void trie::printtrie (char *outputdata)
{
  int k;
  for ( k = 0; k < MAXDEPTH; k++ ) 
    length_count[k] = 0;
  outfilename = fopen (outputdata, "w");
  fprintf (outfilename, "(%d)\n", initialcounts::lines);
  printout (1, root, triesize);
  int lpl;
  for ( lpl = MAXDEPTH-1; lpl >= 0 && length_count[lpl] == 0; lpl-- )
    ;
  if ( initialcounts::lines >= minsup )
    printf ("1\n");
  for ( k = 1; k <= lpl; k++ )  
    printf ("%d\n", length_count[k]);
  fclose(outfilename);
}//trie::printtrie

// do the printing
void trie::printout (int depth, struct bucket *trienode, int number_buckets)
{
  for ( int i = 0; i < number_buckets; i++ )
  {
    results[depth] = trienode[i].itemvalue;
    length_count[depth]++;
    for ( int j = 1; j <= depth; j++ )
      fprintf (outfilename, "%d ", initialcounts::itemsorder[results[j]]);
    if ( depth > 1 )
      fprintf (outfilename, "(%d)\n", trienode[i].count);
    else
      fprintf (outfilename, "(%d)\n", initialcounts::items_frequency[i]);
    if ( trienode[i].number_followers > 0 )
      printout (depth+1, trienode[i].next, trienode[i].number_followers);
  }//for
}//trie::printout

// main program
int main (int argc, char *argv[ ])
{
//   long int time1, time2, c;
//   time1 = time (&c);
   minsup = macro_minsup;
//   printf ("\nReading input starts ...\n");
   initialcounts initialdata (input_filename);
   data_array datagrid (input_filename);
//   printf ("Reading input finished\n");
//   time2 = time (&c);
//   printf ("Execution time so far: %ld\n", time2-time1);

//   printf ("Counting starts ...\n");
//   time1 = time (&c);
   trie ourtrie (initialdata,datagrid);
   ourtrie.build_up ( );
//   time2 = time (&c);
//   printf ("Execution time - counting: %ld\n\n", time2-time1);
   if ( argc > 3 )
     ourtrie.printtrie (output_filename);
   return 0;
}//main


