
//
// C++ program that finds frequent itemsets - file current.cc
// April 23, 2004
// Wim Pijls and Walter Kosters
// Erasmus University Rotterdam and Leiden University, The Netherlands
// pijls@few.eur.nl and kosters@liacs.nl
// http://www.liacs.nl/home/kosters/df/
//
// makefile: g++ -Wall -O3 -o fim_all current.cc
//
// Assume that the FP-database and the trie containing all frequent
// sets together fit in main memory
// 
// The data is read four times:
// 1. to find minimal and maximal item number that occurs
// 2. to determine the frequencies of the single item numbers
// 3. to find the relevant transactions (containing at least 2 frequent items)
// 4. to store the (relevant) database in main memory
//
// Several (time) printing commands are commented out with //
//
// This version (current.cc) replaces previous versions dfmemory.cc
// and dftime.cc; main difference: memory management is improved
// in order to avoid unnecessary allocations, which makes different
// memory/time efficient versions obsolete.
// Runtimes are comparable with those of its predecessors
// Difference with dffast.cc: FP-trees for database; two types distinguished
// (int and unsigned short) during runtime using templates
//

// name inputfile:
#define input_filename argv[1]
// (absolute) value of minsup:
#define macro_minsup atoi(argv[2])
// name outputfile:
#define output_filename argv[3]

#include <iostream>
#include <fstream>
#include <cstdio>
#include <climits>
#include <cstdlib>
#include <ctime>
using namespace std;

const int MAXDEPTH = 100; // maximal depth of trie (for printing)

int minsup;               // minimum support

//========================================================================
//
// DATABASE CONSTRUCTION

class initialcounts
{
  public:
    initialcounts (char *inputdata);
    static int *itemsorder;
    static int *items_frequency;
    static int *ranking;
    static int min_itemnr, max_itemnr;
    static int number_transactions, number_freq_items, lines, last;
  private:
    void first_pass ( );
    void second_pass ( );
    void third_pass ( );
    void initialsort ( );
    char *infilename;
    int *init_items_frequency;
};

int *initialcounts::items_frequency = NULL;
int *initialcounts::ranking = NULL;
int *initialcounts::itemsorder = NULL;
int initialcounts::number_transactions = 0;
int initialcounts::lines = 0;
int initialcounts::number_freq_items = 0;
int initialcounts::min_itemnr = 0;
int initialcounts::max_itemnr = 0;
int initialcounts::last = 0;

// constructor
initialcounts::initialcounts (char *inputdata)
{
  infilename = inputdata;
  first_pass ( );
  second_pass ( );
  third_pass ( );
  initialsort ( );
}//initialcounts::initialcounts

// computes minimal and maximal item number that occur in the database;
// if these are known in advance, this function can be easily adapted
// function reads whole file!
void initialcounts::first_pass ( )
{
  int itemnr;
  bool first = true;
  ifstream fin (infilename);
  if ( ! fin ) 
    cout << "No such filename" << endl;
  char c; 
  int pos;
  do 
  {
    do 
    {
      fin.get (c);
      itemnr = 0;
      pos = 0;
      while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) )
      {
        itemnr = 10*itemnr + (int)(c) - (int)('0');
        pos++;
        fin.get (c);
      }//while
      if ( pos ) 
      {
	if ( first )
          max_itemnr = min_itemnr = itemnr;
	first = false;
        if ( itemnr < min_itemnr )
          min_itemnr = itemnr;
        else if ( itemnr > max_itemnr )
          max_itemnr = itemnr;
      }//if
    } while ( c != '\n' && ! fin.eof ( ) );
  } while ( ! fin.eof ( ) );
  fin.close ( );
}//initialcounts::first_pass

// determines frequency for all items, and number of frequent items
// function reads whole file!
void initialcounts::second_pass ( )
{
  int k;
  ifstream fin (infilename);
  if ( ! fin )
    cout << "No such filename" << endl;
  int itemrange = max_itemnr-min_itemnr+1;
  init_items_frequency = new int[itemrange];
  for ( k = 0; k < itemrange; k++ ) 
    init_items_frequency[k] = 0;
  char c; 
  int item, pos;
  do 
  {
    do
    {
      fin.get (c);
      item = 0;
      pos = 0;
      while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) )
      {
        item = 10*item + (int)(c) - (int)('0');
        pos++;
        fin.get (c);
      }//while
      if ( pos )
        init_items_frequency[item-min_itemnr]++;
    } while ( c != '\n' && ! fin.eof ( ) );
  } while ( ! fin.eof ( ) );
  fin.close ( );
  for ( k = 0; k < itemrange; k++ )
    if ( init_items_frequency[k] >= minsup )
      number_freq_items++;
  printf ("Number of frequent items: %d\n", number_freq_items);
}//initialcounts::second_pass

// determines number of relevant transactions, i.e., those
// that contain at least two frequent items
// (those that contain one frequent item have already been accounted for
// while determining the frequency of the single items)
// 20.2.2007: changed to all transactions that contain some frequent item 
// the variable is never used in the program
// function reads whole file!
void initialcounts::third_pass ( )
{
  number_transactions = 0;
  ifstream fin (infilename);
  if ( ! fin ) 
    cout << "No such filename" << endl;
  char c;
  int item, pos, items_in_trans;
  bool line;
  do 
  {  
    line = false;
    items_in_trans = 0;
    do 
    {
      fin.get (c);
      item = 0;
      pos = 0;
      while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) )
      {
        item = 10*item + (int)(c) - (int)('0');
        pos++;
        line = true;
        fin.get (c);
      }//while
      if ( pos && init_items_frequency[item-min_itemnr] >= minsup )
        items_in_trans++;
    } while ( c != '\n' && ! fin.eof ( ) );
    if ( line ) 
      lines++;
    if ( items_in_trans >= 1 ) 
      number_transactions++;
  } while ( ! fin.eof ( ) );
  printf ("Number of relevant transactions: %d\n", number_transactions);
  printf ("Number of lines: %d\n", lines);
  fin.close ( );
}//initialcounts::third_pass

// sort items with respect to support - and renumber
void initialcounts::initialsort ( )
{
  int left_cursor = 0;
  int right_cursor = max_itemnr-min_itemnr;
  int itemrange = right_cursor+1;
  int *items_numbers;
  int i, j, k;
  items_numbers = new int[number_freq_items];
  for ( k = 0; k < number_freq_items; k++ )
    items_numbers[k] = k;
  while ( left_cursor < right_cursor )
  {
    while ( init_items_frequency[left_cursor] >= minsup 
	   && left_cursor<number_freq_items ) 
      left_cursor++;
    while ( init_items_frequency[right_cursor] < minsup ) 
      right_cursor--;
    if ( left_cursor < right_cursor )
    {
      init_items_frequency[left_cursor] = init_items_frequency[right_cursor];
      items_numbers[left_cursor] = right_cursor;
    }//if
    right_cursor--;
  }//while
  ranking = new int[itemrange];
  for ( k = 0; k < itemrange; k++ ) 
    ranking[k] = -1;
  itemsorder = new int[number_freq_items];
  items_frequency = new int[number_freq_items];
  int maxfrequency, bestitem = 0, bestindex = 0;
  int *used;
  used = new int[number_freq_items];
  for ( i = 0; i < number_freq_items; i++ ) 
    used[i] = 0;
  for ( int rank = number_freq_items-1; rank >= 0; rank-- )
  {
    maxfrequency = -1;
    for ( j = 0; j < number_freq_items; j++ )
      if ( used[j] == 0 && init_items_frequency[j] > maxfrequency )
      { 
	bestindex = j;
        bestitem = items_numbers[j];
        maxfrequency = init_items_frequency[j];
      }//if
    itemsorder[rank] = bestitem+min_itemnr;
    ranking[bestitem] = rank;
    items_frequency[rank] = maxfrequency;
    used[bestindex] = 1;
  }//for
}//initialcounts::initialsort

//=========================================================================
//
// FP BUILDING

template <class T>
struct FPtreenode {
  unsigned short info;
  T count;
  unsigned short mark;
//  unsigned short length;
//  unsigned short depth;
  FPtreenode<T> *child;
  FPtreenode<T> *brother;
  FPtreenode<T> *nextsame;
  FPtreenode<T> *father;
};

template <class T>
class FP
{
  public:
    FPtreenode<T> *FProot;
    FPtreenode<T> **layer;
    FP ( ) { FProot = NULL; }
    void buildFP (char *infilename);
  private:
    void updateFPtree (bool *next_transaction);
};

// push next_transaction into FP-tree
template <class T>
void FP<T>::updateFPtree (bool *next_transaction) 
{
  int art;
//  unsigned short localdepth = 0;
  FPtreenode<T> *ptr = FProot;
  FPtreenode<T> *kid;
  FPtreenode<T> *prev = NULL;
  for ( art = initialcounts::number_freq_items-1; art >= 0; art-- )
    if ( next_transaction[art] ) 
    {
      kid = ptr->child;
//      localdepth++;
      if ( kid ) 
      {
        while ( kid && kid->info != art ) 
	{
	  prev = kid;
	  kid = kid->brother;
	}//while
        if ( kid ) 
        {
 	  kid->count++;
	  ptr = kid;
        }//if
        else
        {
          prev->brother = new FPtreenode<T>;
	  if ( ptr == FProot )
	    prev->brother->father = NULL;
	  else
	    prev->brother->father = ptr;
	  ptr = prev->brother;
          ptr->count = 1;
//	  ptr->depth = localdepth;
	  ptr->mark = ( ptr->info = art ) + 1;
	  ptr->child = ptr->brother = NULL;
	  ptr->nextsame = layer[art];
	  layer[art] = ptr;
        }//else
      }//if
      else
      {
        ptr->child = new FPtreenode<T>;
	if ( ptr == FProot )
	  ptr->child->father = NULL;
	else
	  ptr->child->father = ptr;
	ptr = ptr->child;
	ptr->count = 1;
//	ptr->depth = localdepth;
	ptr->mark = ( ptr->info = art ) + 1;
	ptr->child = ptr->brother = NULL;
	ptr->nextsame = layer[art];
	layer[art] = ptr;
      }//else
    }//if
}//FP::updateFPtree

// reads the entire datafile from file inputdata and
// puts it into an FP-tree
// reads whole file (for the fourth time)!
template <class T>
void FP<T>::buildFP (char *infilename)
{  
  int pos, item, items_count, i;
  unsigned short newitemnr;
  char c;
  bool *next_transaction = new bool[initialcounts::number_freq_items];
  layer = new FPtreenode<T>*[initialcounts::number_freq_items];
  for ( i = 0; i < initialcounts::number_freq_items; i++ )
    layer[i] = NULL;
  FProot = new FPtreenode<T>;
  FProot->info = 0;
  FProot->child = FProot->brother = FProot->father = FProot->nextsame = NULL;
  FProot->count = 0;
//  FProot->depth = 0;
  ifstream fin (infilename);
  if ( ! fin ) 
    cout << "No such filename" << endl; 
  do 
  {
    for ( int column = 0; column < initialcounts::number_freq_items; column++ ) 
      next_transaction[column] = false;
    items_count = 0;
    do 
    {
      fin.get (c);
      item = 0;
      pos = 0;
      while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) )
      {
        item = 10*item + (int)(c) -(int)('0');
        pos++;
        fin.get (c);
      }//while
      if ( pos && initialcounts::ranking[item-initialcounts::min_itemnr] >= 0 )
      {	
	newitemnr = initialcounts::ranking[item-initialcounts::min_itemnr];
	next_transaction[newitemnr] = true;
        items_count++;
      }//if
    } while ( c != '\n' && ! fin.eof ( ) );
    if ( items_count >= 1 ) // perhaps 2, but does it really matter?
      updateFPtree (next_transaction);
  } while( ! fin.eof ( ) );
  fin.close ( );
  delete [ ] next_transaction;
}//FP::buildFP

//=========================================================================
//
// COUNTING

template <class T>
struct bucket
{
  unsigned short itemvalue;
  T count;
  T aux;
  unsigned short number_followers;
  bucket *next;
};

template <class T>
class trie
{
  public:
    trie ( ) { };
    trie (initialcounts & initialdata);
    void build_up (FP<T> & theoriginaltree);
    void printtrie (char *outputdata);
  private:
    int length_count[MAXDEPTH];
    bool *frequent;
    void fpcount3 (unsigned short *nodes,
		   bucket<T> *trienode, unsigned short number_buckets);
    void makeaux0 (bucket<T> *root, unsigned short number);
    FILE *outfilename;
    struct bucket<T> *root;
    unsigned short triesize;
    unsigned short k;
    T thevalue;
    unsigned short *thearraypointer;
    int results[MAXDEPTH];
    unsigned short *follows;
    struct bucket<T> **roots;
    void copying (bucket<T> *p, bucket<T> *q, unsigned short number_q_buckets);
    void printout (int depth, bucket<T> *trienode, unsigned short number_buckets);
    void dotheFPtree (FPtreenode<T> *itsroot);
};

// constructor
template <class T>
trie<T>::trie (initialcounts & initialdata)
{
  triesize = initialcounts::number_freq_items;
  root = new bucket<T>[triesize];
  frequent = new bool[triesize];
  thearraypointer = new unsigned short[triesize+1];
  *thearraypointer = 0;
  thearraypointer++;
  follows = new unsigned short[triesize];
  roots = new bucket<T>*[triesize];
  for ( unsigned short itemnr = 0; itemnr < triesize; itemnr++ )
  { 
    root[itemnr].count = 666;  
    // to avoid problems with short's;
    // this particular count-field is never used anymore (I hope)
    root[itemnr].itemvalue = itemnr;
    root[itemnr].number_followers = 0;
    roots[itemnr] = root[itemnr].next = NULL;
    follows[itemnr] = 0;
  }//for
}//trie::trie

// do the FP-counting, already knowing the relevant frequent 2-itemsets
// now the deeper levels
// number_buckets > 0, nodes contains a sentinel 0 at the start
template <class T>
void trie<T>::fpcount3 (unsigned short *nodes,
		        bucket<T> *trienode, unsigned short number_buckets) 
{
  unsigned short x = trienode->itemvalue;
  do
  {
    while ( *nodes != x )
      if ( *nodes > x )
      {
        trienode++;
        if ( ! ( --number_buckets ) )
          return;
	x = trienode->itemvalue;
      }//if
      else
	if ( ! *--nodes )
	  return;
    trienode->aux += thevalue;
    if ( ! *--nodes ) 
      return;
    if ( trienode->number_followers )
      fpcount3 (nodes, trienode->next, trienode->number_followers);
    trienode++;
    if ( ! ( --number_buckets ) )
      return;
    x = trienode->itemvalue;
  } while ( true );
}//trie::fpcount3

// traverse the FP-tree
template <class T>
void trie<T>::dotheFPtree (FPtreenode<T> *itsroot)
{
  itsroot = itsroot->child;
  while ( itsroot )
  {
    if ( itsroot->mark == k )
    {
      if ( frequent[*thearraypointer = itsroot->info] )
      {
	if ( follows[*thearraypointer] )
	{
	  thevalue = itsroot->count;
	  fpcount3 (thearraypointer, roots[*thearraypointer], 
		    follows[*thearraypointer]);
	}//if
	thearraypointer++;
	dotheFPtree (itsroot);
	thearraypointer--;
      }//if
      else
        dotheFPtree (itsroot);
    }//if
    itsroot = itsroot->brother;
  }//while
}//trie::dotheFPtree

// build trie out of FP-transactions
template <class T>
void trie<T>::build_up (FP<T> & theoriginaltree)
{
  FPtreenode<T> *goingup;
  FPtreenode<T> *globpointer;
  T *counttwoitemsets = new T[triesize];
  int cnt, i, suppo;
  k = triesize - 2;
  while ( triesize > 1 )
  {
    globpointer = theoriginaltree.layer[k];
    for ( i = k+1; i < triesize; i++ )
      counttwoitemsets[i] = 0;
    while ( goingup = globpointer )  // note single =!
    {
      suppo = globpointer->count;
      while ( ( goingup = goingup->father ) && goingup->mark > k ) 
      {
        counttwoitemsets[goingup->info] += suppo;
        goingup->count = suppo;
	goingup->mark = k;
      }//while
      while ( goingup ) 
      {
        counttwoitemsets[goingup->info] += suppo;
        goingup->count += suppo;
        goingup = goingup->father;
      }//while
      globpointer = globpointer->nextsame;
    }//while
    cnt = 0;
    for ( i = triesize-1; i >= k+1; i-- )  // in this order!
      if ( counttwoitemsets[i] < minsup )
      {
        root[i].aux = 0;
        frequent[i] = false;
      }//if
      else 
      {	      
        root[i].aux = counttwoitemsets[i];
        frequent[i] = true;
	makeaux0 (roots[i], follows[i]);
	cnt++;
      }//else
    if ( cnt ) 
    {
      dotheFPtree (theoriginaltree.FProot);
      roots[k] = root[k].next = new bucket<T>[cnt];
      follows[k] = root[k].number_followers = cnt;
      copying (roots[k], root+k+1, triesize-k-1);
    }//if
    if ( k == 0 )
      break;
    k--;
  }//while
}//trie::build_up

// make all necessary aux-fields 0
template <class T>
void trie<T>::makeaux0 (bucket<T> *root, unsigned short number) 
{
  for ( int j = 0; j < number; j++ ) 
  {
    root->aux = 0;
    if ( root->number_followers && frequent[root->itemvalue] )
      makeaux0 (root->next, root->number_followers);
    root++;
  }//for
}//trie::makeaux0

// copy trie structure from q to p
template <class T>
void trie<T>::copying (bucket<T> * p, bucket<T> *q, 
		       unsigned short number_q_buckets)
{
  short temp;  // how many buckets does p need?
  short i;
  for ( int source = 0; source < number_q_buckets; source++ )
  {
    if ( q->aux >= minsup )
    {
      p->count = q->aux;
      p->itemvalue = q->itemvalue;
      temp = 0;
      for ( i = q->number_followers-1; i >= 0; i-- )
        if ( q->next[i].aux >= minsup )
	  temp++;
      p->number_followers = temp;
      if ( temp > 0 )
      {
        p->next = new bucket<T>[temp];
        copying (p->next, q->next, q->number_followers);
      }//if
      else
        p->next = NULL;
      p++;
    }//if
    q++;
  }//for
}//trie::copying 

// print resulting trie and frequency of each pattern length
template <class T>
void trie<T>::printtrie (char *outputdata)
{
  int k;
  for ( k = 0; k < MAXDEPTH; k++ ) 
    length_count[k] = 0;
  outfilename = fopen (outputdata, "w");
  fprintf (outfilename, "(%d)\n", initialcounts::lines);  // empty set
  printout (1, root, triesize);
  int lpl;
  for ( lpl = MAXDEPTH-1; lpl >= 0 && length_count[lpl] == 0; lpl-- )
    ;
  if ( initialcounts::lines >= minsup )
    printf ("1\n");
  for ( k = 1; k <= lpl; k++ )  
    printf ("%d\n", length_count[k]);
  fclose(outfilename);
}//trie::printtrie

// do the printing
template <class T>
void trie<T>::printout (int depth, bucket<T> *trienode, 
		        unsigned short number_buckets)
{
  for ( int i = 0; i < number_buckets; i++ )
  {
    results[depth] = trienode[i].itemvalue;
    length_count[depth]++;
    for ( int j = 1; j <= depth; j++ )
      fprintf (outfilename, "%d ", initialcounts::itemsorder[results[j]]);
    if ( depth > 1 )
      fprintf (outfilename, "(%d)\n", trienode[i].count);
    else
      fprintf (outfilename, "(%d)\n", initialcounts::items_frequency[i]);
    if ( trienode[i].number_followers > 0 )
      printout (depth+1, trienode[i].next, trienode[i].number_followers);
  }//for
}//trie::printout

// main program
int main (int argc, char *argv[ ])
{
   long int time1, time2, c;
   minsup = macro_minsup;
   if ( minsup <= 0 ) minsup = 1;
   time1 = time (&c);
   printf ("\nReading input starts ...\n");
   initialcounts initialdata (input_filename);
   time2 = time (&c);
   printf ("Execution time - reading: %ld\n", time2-time1);
  
   if ( initialcounts::items_frequency[initialcounts::number_freq_items-1] 
	>= USHRT_MAX ) 
   {
     time1 = time (&c);
     printf ("Building (big) FP-tree starts ...\n");
     FP<int> theFPtree;
     theFPtree.buildFP (input_filename);
     printf ("FP-tree completed\n");
     time2 = time (&c);
     printf ("Execution time - FP-phase: %ld\n", time2-time1);
     time1 = time (&c);
     printf ("Counting (big) starts ...\n");
     trie<int> ourbigtrie (initialdata);
     ourbigtrie.build_up (theFPtree);
     time2 = time (&c);
     printf ("Execution time - counting: %ld\n\n", time2-time1);
     if ( argc > 3 )
       ourbigtrie.printtrie (output_filename);
   }//if
   else {
     time1 = time (&c);
     printf ("Building (small) FP-tree starts ...\n");
     FP<unsigned short> theFPtree;
     theFPtree.buildFP (input_filename);
     printf ("FP-tree completed\n");
     time2 = time (&c);
     printf ("Execution time - FP-phase: %ld\n", time2-time1);
     time1 = time (&c);
     printf ("Counting (small) starts ...\n");
     trie<unsigned short> oursmalltrie (initialdata);
     oursmalltrie.build_up (theFPtree);
     time2 = time (&c);
     printf ("Execution time - counting: %ld\n\n", time2-time1);
     if ( argc > 3 )
       oursmalltrie.printtrie (output_filename);
   }//else
   
   return 0;
}//main


