next up previous contents
Next: Some Tools for BN Up: Integration of Data Mining Previous: The Bayesian Network Used

Integrated System Source Code

 


//-------------------------------------------------------------------------
// CBR-DM Integration
// A prototype to show a possible implementation-method.
//
// Torgeir Dingsoyr 1997/1998
//
// History:
// 	V0: 26.12.97 - Shell
//	V1: 02.02.98 - Loads casebase
//	V2: 04.02.98 - Case management ok, prints cases.
//	V3: 05.02.98 - Simple similarity metric.
//	V4: 06.02.98 - Uses data from bayesian network in computation of
//		       similarity metric.
//	V5: 08.02.98 - Case management from memory.
//	V6: 08.02.98 - Speed-up, bugs fixed.
//
// Input files:
// DBFILE	- tab-separated casebase. First line is feature name,
//		  second feature type ([I]nteger or [S]ymbol), third line
//		  is feature range. NUMCASES cases follow.
// BAYESFILE	- Bayesian network file in the following format:
//		  Prob<Tab>DestFeat<Tab>FeatureVal<Tab>...
//		  Where Prob is P(DestFeat|FeatureValues).
//		  Feature values without influence is labeles "NA:
//		  The file consists of NUMBAYES lines. The order of the
//		  features must be the same as in the casebase.
// QUESTFILE	- A tab-separated file containing "new cases" that will
//		  be compared to all cases in DBFILE.
//
// Output file:
// OUTFILE	- A tab-separated file containing all features, and the
//		  case number for the NUMRET most similar cases when
//		  comparing the cases in DBFILE to the cases in QUESTFILE.
//-------------------------------------------------------------------------

#include <stdio.h>
#include <iostream.h>
#include <fstream.h>
#include <math.h>
#include <string.h>

#define NUMCHAR 20		// Number of characters per feature.
#define NUMFEAT 8		// Number of features in casebase
#define NUMCASES 4646		// Number of cases in casebase.
#define NUMRET 1		// Cases to retrieve.
#define NUMBAYES 182		// Lines of Bayes data structure
#define TRUE 1
#define FALSE 0
#define OFFSET 2		// 2 lines with info before data in casebase-file
#define QUESTCASE 45		// # of "new cases" to match with casebase
#define DBFILE "recomp2.db"
#define BAYESFILE "recomp2.bn"
#define OUTFILE "results.txt"
#define QUESTFILE "newcases.txt"
#define DBSIZE 220632		// Size of casebase file.
#define BAYSIZE 7060		// Size of Bayesian network file.

//-------------------------------------------------------------------------
// CASE structure. (Simple)
//-------------------------------------------------------------------------

struct acase
{
char Feature[NUMFEAT+1][NUMCHAR];
int CaseNum;
};

struct casebase
{
acase description;	// Feature name
acase type;             // Feature type: I-Integer, S-Symbol
acase range;		// Feature range for integer (0-"range value")
int   weights[NUMFEAT];	// Feature weight
acase curcase;		// Current case values
};

//-------------------------------------------------------------------------
// Bayesian network structure
//-------------------------------------------------------------------------

struct aprob
{
int Probability;     // States P(Destfeat|case)
int DestFeature;
char Feature[NUMFEAT+3][NUMCHAR];
};

//-------------------------------------------------------------------------
// Functions used
//-------------------------------------------------------------------------
acase GetCaseD(int cnumber);
void PrintCase(casebase cases);
acase InputCase(casebase cases);
double SimMetric(acase casex, acase casey, casebase cb);
double f(int i, acase casex, acase casey, casebase cb);
void InsertB(int j, double sim, int i);
void Retrieve(casebase cb, acase newcase);
double FindBayesProb(int i, acase casey, acase casex, casebase cb);
double SearchBayesD(acase curcase, int l);
double SearchBayes(acase curcase, int l);
void LoadBayes();
int Compare(aprob tempprob, acase curcase);
int chartoint(char string[NUMCHAR]);
acase GetNextCase(int cnumber);
void SaveCase(casebase thecase);
acase GetCase(int cnumber);
void GetCaseBase();
void RetrieveAll(casebase cb, acase newcase);

//-------------------------------------------------------------------------
// Global variables
//-------------------------------------------------------------------------

	double		Simbuff[NUMRET];
	int		Casebuff[NUMRET];
	char huge	DataFile[DBSIZE];
	char 		BayesFile[BAYSIZE];
	long 		CaseOffset[NUMCASES+OFFSET];
	long 		BayesOffset[NUMBAYES];

//-------------------------------------------------------------------------
// Main
//-------------------------------------------------------------------------
main()
{
	cout << endl << "Integrated CBR-DM System. 1998 TD" << endl;

	casebase caseb;
	caseb.description = GetCaseD(-2);
	caseb.type = GetCaseD(-1);
	caseb.range = GetCaseD(0);
	LoadBayes();
//	acase incase = InputCase(caseb);

// Get a new case from question database
	for (int c=0;c<QUESTCASE;c++)
		{
		cout << c << endl;
		acase incase = GetNextCase(c);
		RetrieveAll(caseb, incase);
//		SaveCase(incase);
// Save NUMRET best matches.
		for (int n=0;n<NUMRET;n++)
			{
			caseb.curcase = GetCaseD(Casebuff[n]);
			SaveCase(caseb);
			}
		}
}

//-------------------------------------------------------------------------
// Prints the case to standard output.
// V1. 04.02.98 TD.
//-------------------------------------------------------------------------
void PrintCase(casebase cases)
{
	cout << "CASE number" << cases.curcase.CaseNum << endl;
	for (int i=0;i<NUMFEAT+1;i++)
	{
	cout << cases.description.Feature[i] << '\t' << '\t';
	cout << cases.curcase.Feature[i];
	cout << endl << endl;
	}
}

//-------------------------------------------------------------------------
// Saves the case to OUTFILE.
// V1. 07.02.98 TD.
// V2. 09.02.98 TD. Single line feature output.
//-------------------------------------------------------------------------
void SaveCase(casebase thecase)
{
	ofstream outFile(OUTFILE,ios::app);

	if (!outFile)
		cout << "File output error";

	for (int i=0;i<NUMFEAT+1;i++)
	{
	outFile << thecase.curcase.Feature[i] << "\t";
	}
	outFile << thecase.curcase.CaseNum << endl;
}

//-------------------------------------------------------------------------
// Get a case from disk.
// V1. 04.02.98 TD. Extremely inefficient and time-consuming.
// V2. 06.02.98 TD. Bug removed. Offset introduced.
//-------------------------------------------------------------------------
acase GetCaseD(int cnumber)
{
	cnumber = cnumber+OFFSET;	// Offset for case description in file.
	acase tempcase;
	tempcase.CaseNum = cnumber;
	ifstream inFile(DBFILE, ios::in);

	if (!inFile)
        	cout << "File error.";

	for (int k=0;k<(cnumber+1);k++)
	{
	for (int j=0; j<(NUMFEAT+1); j++)
		{
			char ch;
			int i=0;
			while ( inFile.get(ch) && (ch != '\t' && ch != '\n'))
			{
				// cout << ch;
				tempcase.Feature[j][i] = ch;
				i++;
			}
			tempcase.Feature[j][i]=0;
		}
	}
	return tempcase;
}

//-------------------------------------------------------------------------
// Get a casebase from disk.
// V1. 08.02.98 TD.
//-------------------------------------------------------------------------
void GetCaseBase()
{
	ifstream inFile(DBFILE, ios::in);

	if (!inFile)
		cout << "File error.";

	CaseOffset[0]=0;
	int l=1;
	char ch;
	for (long k=0;l<NUMCASES;k++)
	{
		inFile.get(ch);
		DataFile[k]=ch;
		if (ch=='\n')
			{
			CaseOffset[l]=k+1;
			l++;
			}
	}
}

//-------------------------------------------------------------------------
// Get a case from memory.
// V1. 08.02.98 TD.
//-------------------------------------------------------------------------
acase GetCase(int cnumber)
{
	cnumber = cnumber+OFFSET;	// Offset for case description in file.
	acase tempcase;
	tempcase.CaseNum = cnumber;

	long k = CaseOffset[cnumber];
	int l = 0;

	for (int j=0; j<(NUMFEAT+1); j++)
		{
			char ch;
			int i=0;
			ch=DataFile[k+l];
			while (ch != '\t' && ch != '\n')
			{
				// cout << ch;
				tempcase.Feature[j][i] = ch;
				i++;
				l++;
				ch=DataFile[k+l];
			}
			l++;
			tempcase.Feature[j][i]=0;
		}
	return tempcase;
}


//-------------------------------------------------------------------------
// Get a new case from disk. ("question")
// V1. 07.02.98 TD.
//-------------------------------------------------------------------------
acase GetNextCase(int cnumber)
{
	acase tempcase;
	ifstream inFile(QUESTFILE, ios::in);

	if (!inFile)
		cout << "File error.";

	for (int k=0;k<(cnumber+1);k++)
	{
	for (int j=0; j<(NUMFEAT+1); j++)
		{
			char ch;
			int i=0;
			while ( inFile.get(ch) && (ch != '\t' && ch != '\n'))
			{
				// cout << ch;
				tempcase.Feature[j][i] = ch;
				i++;
			}
			tempcase.Feature[j][i]=0;
		}
	}
	return tempcase;
}

//-------------------------------------------------------------------------
// Input a new case from user.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
acase InputCase(casebase cases)
{
	acase Newcase;
	cout << "Input new case";
	for (int i=0;i<NUMFEAT+1;i++)
	{
	cout << cases.description.Feature[i] << " = ";
	cin >> Newcase.Feature[i];
	}
	return Newcase;
}

//-------------------------------------------------------------------------
// Retrieve the NUMRET most similar cases.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
void RetrieveAll(casebase cb, acase newcase)
{
	acase tempcase;
	double	sim;
	int ins;

	for (int i=0;i<NUMRET;i++)
	Simbuff[i] = Casebuff[i] = 0;

	ifstream inFile(DBFILE, ios::in);

	if (!inFile)
		cout << "File error.";

	for (int k=0;k<(NUMCASES+1);k++)
	{
		for (int j=0; j<(NUMFEAT+1); j++)
		{
			char ch;
			int i=0;
			while ( inFile.get(ch) && (ch != '\t' && ch != '\n'))
			{
				// cout << ch;
				tempcase.Feature[j][i] = ch;
				i++;
			}
			tempcase.Feature[j][i]=0;
		}

	if (k % 100 == 0) cout << "\t" << k << endl;

		cb.curcase = tempcase;
		sim = SimMetric(newcase, cb.curcase,cb);
		int l=0;
		ins = FALSE;
		while (l<NUMRET && !ins && k> OFFSET)
			{
			if (sim >= Simbuff[l]) {
						InsertB(l, sim,k-OFFSET);
						ins=TRUE;
					      }
			l++;
			}
	}
}

//-------------------------------------------------------------------------
// Retrieve the NUMRET most similar cases.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
void Retrieve(casebase cb, acase newcase)
{
	double	sim;
	int ins;

	// Clear buffers
	for (int i=0;i<NUMRET;i++)
		Simbuff[i] = Casebuff[i] = 0;

	for (int l=1;l<NUMCASES+1;l++)
	{
//		cout << l<< endl;
		cb.curcase = GetCaseD(l);
		sim = SimMetric(newcase, cb.curcase,cb);
		int j=0;
		ins = FALSE;
		while (j<NUMRET && !ins)
			{
			if (sim >= Simbuff[j]) {
						InsertB(j, sim,l);
						ins=TRUE;
					      }
			j++;
			}
	}
}

//-------------------------------------------------------------------------
// Insert case i with sim. metric sim into place j in retrieve buffer
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
void InsertB(int j, double sim, int i)
{
int k;

for (k=NUMRET-1;k>j;k--)
{
	Simbuff[k] = Simbuff[k-1];
	Casebuff[k] = Casebuff[k-1];
}
	Simbuff[k] = sim;
	Casebuff[k] = i;
}

//-------------------------------------------------------------------------
// Compute similarity metric.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
double SimMetric(acase casex, acase casey, casebase cb)
{
	double sim = 0;
	for (int i=0;i<NUMFEAT+1;i++)
	{
		sim = sim + f(i, casex, casey, cb);
	}
	sim = sqrt(sim);
	return(sim);
}

//-------------------------------------------------------------------------
// Similarity function.
// V1. 05.02.98 TD.
// V2. 98.92.98 TD. Bug fixed.
//-------------------------------------------------------------------------
double f(int i, acase casex, acase casey, casebase cb)
{
//	if (cb.type.Feature[i][0] == 'I') // Integer
//	{
//		return(0.5);
//	}
//	else			// Symbol
//	{
	if (strcmp(casex.Feature[i],casey.Feature[i])==0)
		return(1);
	if (casex.Feature[i] != casey.Feature[i])
		{
			if (strcmp(casex.Feature[i],"NA")==0&&
			strcmp(casey.Feature[i],"NA")!=0)
			{
			return(FindBayesProb(i,casey,casex,cb));
			}
			else return(0);
		}
//	}
}

//-------------------------------------------------------------------------
// Find probability form a Bayes network. Ad-hoc version. Should have used
// DLLs from Microsoft Beleif Networks.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
double FindBayesProb(int i,acase casey,acase casex,casebase cb)
{
	double probab;
	acase tempcase;
	tempcase = casex;
	strcpy(tempcase.Feature[i],casey.Feature[i]);
	probab = SearchBayes(tempcase,i);
	return(probab);
}

//-------------------------------------------------------------------------
// Search through a file produced from a Bayesian network for
// P(Feature_value|rest_of_features) when Feature_valus is missing (NA).
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
double SearchBayesD(acase curcase, int l)
{
	double	probability=0;
	double	maxprob;
	int	nummatch=0;
	int	maxmatch=0;
	aprob	tempprob;
	ifstream inFile(BAYESFILE, ios::in);

	if (!inFile)
		return(0);

	for (int k=0;k<NUMBAYES;k++)
	{

	for (int j=0; j<(NUMFEAT+3); j++)
		{
			char ch;
			for (int i=0;i<NUMCHAR+1;i++)
				tempprob.Feature[j][i]=0;
			char f[NUMCHAR];
			i=0;
			while ( inFile.get(ch) && (ch != '\t' && ch != '\n'))
			{
				// cout << ch;
				tempprob.Feature[j][i] = ch;
				i++;
			}
		}
		if (tempprob.Feature[1][0]=l&&
			strcmp(tempprob.Feature[l+OFFSET],curcase.Feature[l])==0)
		{
		tempprob.DestFeature= (int) tempprob.Feature[1][0];
		probability = chartoint(tempprob.Feature[0]);
		probability = probability/1000;
		nummatch = Compare(tempprob,curcase);
		if (nummatch > maxmatch) {
					  maxprob = probability;
					  maxmatch = nummatch;
					  }
		else if (nummatch == maxmatch && maxprob < probability
			&& nummatch >1)
					  {
					  maxprob = probability;
					  }
		}
	}
	return(maxprob);
}

//-------------------------------------------------------------------------
// Search through a file produced from a Bayesian network for
// P(Feature_value|rest_of_features) when Feature_valus is missing (NA).
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
void LoadBayes()
{
	ifstream inFile(BAYESFILE, ios::in);

//	if (!inFile)

	BayesOffset[0]=0;
	int l=1;
	char ch;
	for (long k=0;l<NUMBAYES;k++)
	{
		inFile.get(ch);
		BayesFile[k]=ch;
		if (ch=='\n')
			{
			BayesOffset[l]=k+1;
			l++;
			}
	}
}

//-------------------------------------------------------------------------
// Search through a Bayesian network in memory for
// P(Feature_value|rest_of_features) when Feature_valus is missing (NA).
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
double SearchBayes(acase curcase, int l)
{
	double	probability=0;
	double	maxprob=0;
	int	nummatch=0;
	int	maxmatch=0;
	aprob	tempprob;

	int m=-1;	// Points to location in bayes table.
	for (int k=0;k<NUMBAYES;k++)
	{
	for (int j=0; j<(NUMFEAT+3); j++)
		{
			char ch;
			for (int i=0;i<NUMCHAR+1;i++)
				tempprob.Feature[j][i]=0;
			char f[NUMCHAR];
			i=0;
                        m++;
			ch=BayesFile[m];
			while (ch != '\t' && ch != '\n')
			{
				tempprob.Feature[j][i] = ch;
				i++;
				m++;
				ch=BayesFile[m];
			}
		}
		if (tempprob.Feature[1][0]=l&&
			strcmp(tempprob.Feature[l+OFFSET],curcase.Feature[l])==0)
		{
		tempprob.DestFeature= (int) tempprob.Feature[1][0];
		probability = chartoint(tempprob.Feature[0]);
		probability = probability/1000;
		nummatch = Compare(tempprob,curcase);
		if (nummatch > maxmatch) {
					  maxprob = probability;
					  maxmatch = nummatch;
					  }
		else if (nummatch == maxmatch && maxprob < probability
			&& nummatch >1)
					  {
					  maxprob = probability;
					  }
		}
	}
	return(maxprob);
}

//-------------------------------------------------------------------------
// Find # of features that match.
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
int Compare(aprob tempprob, acase curcase)
{
int m=-1;	// must always match one.
for (int i=0;i<NUMFEAT;i++)
	{
	 if (strcmp(tempprob.Feature[i+OFFSET],curcase.Feature[i])==0 &&
	     strcmp(tempprob.Feature[i+OFFSET],"NA")!=0) m=m+1;
	}
return(m);
}

//-------------------------------------------------------------------------
// Convert a char* to integer.
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
int chartoint(char string[NUMCHAR])
{
	int number=0;
	int pot=1;
	int i=0;
	while (i<NUMCHAR&&string[i]!=0) ++i;
	i--;
	while (i>=0) 	{
			number=number+pot*((int)string[i]-48);
			pot=pot*10;
			i--;
			}
return number;
}


Torgeir Dingsoyr
2/26/1998