//-------------------------------------------------------------------------
// CBR-DM Integration
// A prototype to show a possible implementation-method.
//
// Torgeir Dingsoyr 1997/1998
//
// History:
// V0: 26.12.97 - Shell
// V1: 02.02.98 - Loads casebase
// V2: 04.02.98 - Case management ok, prints cases.
// V3: 05.02.98 - Simple similarity metric.
// V4: 06.02.98 - Uses data from bayesian network in computation of
// similarity metric.
// V5: 08.02.98 - Case management from memory.
// V6: 08.02.98 - Speed-up, bugs fixed.
//
// Input files:
// DBFILE - tab-separated casebase. First line is feature name,
// second feature type ([I]nteger or [S]ymbol), third line
// is feature range. NUMCASES cases follow.
// BAYESFILE - Bayesian network file in the following format:
// Prob<Tab>DestFeat<Tab>FeatureVal<Tab>...
// Where Prob is P(DestFeat|FeatureValues).
// Feature values without influence is labeles "NA:
// The file consists of NUMBAYES lines. The order of the
// features must be the same as in the casebase.
// QUESTFILE - A tab-separated file containing "new cases" that will
// be compared to all cases in DBFILE.
//
// Output file:
// OUTFILE - A tab-separated file containing all features, and the
// case number for the NUMRET most similar cases when
// comparing the cases in DBFILE to the cases in QUESTFILE.
//-------------------------------------------------------------------------
#include <stdio.h>
#include <iostream.h>
#include <fstream.h>
#include <math.h>
#include <string.h>
#define NUMCHAR 20 // Number of characters per feature.
#define NUMFEAT 8 // Number of features in casebase
#define NUMCASES 4646 // Number of cases in casebase.
#define NUMRET 1 // Cases to retrieve.
#define NUMBAYES 182 // Lines of Bayes data structure
#define TRUE 1
#define FALSE 0
#define OFFSET 2 // 2 lines with info before data in casebase-file
#define QUESTCASE 45 // # of "new cases" to match with casebase
#define DBFILE "recomp2.db"
#define BAYESFILE "recomp2.bn"
#define OUTFILE "results.txt"
#define QUESTFILE "newcases.txt"
#define DBSIZE 220632 // Size of casebase file.
#define BAYSIZE 7060 // Size of Bayesian network file.
//-------------------------------------------------------------------------
// CASE structure. (Simple)
//-------------------------------------------------------------------------
struct acase
{
char Feature[NUMFEAT+1][NUMCHAR];
int CaseNum;
};
struct casebase
{
acase description; // Feature name
acase type; // Feature type: I-Integer, S-Symbol
acase range; // Feature range for integer (0-"range value")
int weights[NUMFEAT]; // Feature weight
acase curcase; // Current case values
};
//-------------------------------------------------------------------------
// Bayesian network structure
//-------------------------------------------------------------------------
struct aprob
{
int Probability; // States P(Destfeat|case)
int DestFeature;
char Feature[NUMFEAT+3][NUMCHAR];
};
//-------------------------------------------------------------------------
// Functions used
//-------------------------------------------------------------------------
acase GetCaseD(int cnumber);
void PrintCase(casebase cases);
acase InputCase(casebase cases);
double SimMetric(acase casex, acase casey, casebase cb);
double f(int i, acase casex, acase casey, casebase cb);
void InsertB(int j, double sim, int i);
void Retrieve(casebase cb, acase newcase);
double FindBayesProb(int i, acase casey, acase casex, casebase cb);
double SearchBayesD(acase curcase, int l);
double SearchBayes(acase curcase, int l);
void LoadBayes();
int Compare(aprob tempprob, acase curcase);
int chartoint(char string[NUMCHAR]);
acase GetNextCase(int cnumber);
void SaveCase(casebase thecase);
acase GetCase(int cnumber);
void GetCaseBase();
void RetrieveAll(casebase cb, acase newcase);
//-------------------------------------------------------------------------
// Global variables
//-------------------------------------------------------------------------
double Simbuff[NUMRET];
int Casebuff[NUMRET];
char huge DataFile[DBSIZE];
char BayesFile[BAYSIZE];
long CaseOffset[NUMCASES+OFFSET];
long BayesOffset[NUMBAYES];
//-------------------------------------------------------------------------
// Main
//-------------------------------------------------------------------------
main()
{
cout << endl << "Integrated CBR-DM System. 1998 TD" << endl;
casebase caseb;
caseb.description = GetCaseD(-2);
caseb.type = GetCaseD(-1);
caseb.range = GetCaseD(0);
LoadBayes();
// acase incase = InputCase(caseb);
// Get a new case from question database
for (int c=0;c<QUESTCASE;c++)
{
cout << c << endl;
acase incase = GetNextCase(c);
RetrieveAll(caseb, incase);
// SaveCase(incase);
// Save NUMRET best matches.
for (int n=0;n<NUMRET;n++)
{
caseb.curcase = GetCaseD(Casebuff[n]);
SaveCase(caseb);
}
}
}
//-------------------------------------------------------------------------
// Prints the case to standard output.
// V1. 04.02.98 TD.
//-------------------------------------------------------------------------
void PrintCase(casebase cases)
{
cout << "CASE number" << cases.curcase.CaseNum << endl;
for (int i=0;i<NUMFEAT+1;i++)
{
cout << cases.description.Feature[i] << '\t' << '\t';
cout << cases.curcase.Feature[i];
cout << endl << endl;
}
}
//-------------------------------------------------------------------------
// Saves the case to OUTFILE.
// V1. 07.02.98 TD.
// V2. 09.02.98 TD. Single line feature output.
//-------------------------------------------------------------------------
void SaveCase(casebase thecase)
{
ofstream outFile(OUTFILE,ios::app);
if (!outFile)
cout << "File output error";
for (int i=0;i<NUMFEAT+1;i++)
{
outFile << thecase.curcase.Feature[i] << "\t";
}
outFile << thecase.curcase.CaseNum << endl;
}
//-------------------------------------------------------------------------
// Get a case from disk.
// V1. 04.02.98 TD. Extremely inefficient and time-consuming.
// V2. 06.02.98 TD. Bug removed. Offset introduced.
//-------------------------------------------------------------------------
acase GetCaseD(int cnumber)
{
cnumber = cnumber+OFFSET; // Offset for case description in file.
acase tempcase;
tempcase.CaseNum = cnumber;
ifstream inFile(DBFILE, ios::in);
if (!inFile)
cout << "File error.";
for (int k=0;k<(cnumber+1);k++)
{
for (int j=0; j<(NUMFEAT+1); j++)
{
char ch;
int i=0;
while ( inFile.get(ch) && (ch != '\t' && ch != '\n'))
{
// cout << ch;
tempcase.Feature[j][i] = ch;
i++;
}
tempcase.Feature[j][i]=0;
}
}
return tempcase;
}
//-------------------------------------------------------------------------
// Get a casebase from disk.
// V1. 08.02.98 TD.
//-------------------------------------------------------------------------
void GetCaseBase()
{
ifstream inFile(DBFILE, ios::in);
if (!inFile)
cout << "File error.";
CaseOffset[0]=0;
int l=1;
char ch;
for (long k=0;l<NUMCASES;k++)
{
inFile.get(ch);
DataFile[k]=ch;
if (ch=='\n')
{
CaseOffset[l]=k+1;
l++;
}
}
}
//-------------------------------------------------------------------------
// Get a case from memory.
// V1. 08.02.98 TD.
//-------------------------------------------------------------------------
acase GetCase(int cnumber)
{
cnumber = cnumber+OFFSET; // Offset for case description in file.
acase tempcase;
tempcase.CaseNum = cnumber;
long k = CaseOffset[cnumber];
int l = 0;
for (int j=0; j<(NUMFEAT+1); j++)
{
char ch;
int i=0;
ch=DataFile[k+l];
while (ch != '\t' && ch != '\n')
{
// cout << ch;
tempcase.Feature[j][i] = ch;
i++;
l++;
ch=DataFile[k+l];
}
l++;
tempcase.Feature[j][i]=0;
}
return tempcase;
}
//-------------------------------------------------------------------------
// Get a new case from disk. ("question")
// V1. 07.02.98 TD.
//-------------------------------------------------------------------------
acase GetNextCase(int cnumber)
{
acase tempcase;
ifstream inFile(QUESTFILE, ios::in);
if (!inFile)
cout << "File error.";
for (int k=0;k<(cnumber+1);k++)
{
for (int j=0; j<(NUMFEAT+1); j++)
{
char ch;
int i=0;
while ( inFile.get(ch) && (ch != '\t' && ch != '\n'))
{
// cout << ch;
tempcase.Feature[j][i] = ch;
i++;
}
tempcase.Feature[j][i]=0;
}
}
return tempcase;
}
//-------------------------------------------------------------------------
// Input a new case from user.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
acase InputCase(casebase cases)
{
acase Newcase;
cout << "Input new case";
for (int i=0;i<NUMFEAT+1;i++)
{
cout << cases.description.Feature[i] << " = ";
cin >> Newcase.Feature[i];
}
return Newcase;
}
//-------------------------------------------------------------------------
// Retrieve the NUMRET most similar cases.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
void RetrieveAll(casebase cb, acase newcase)
{
acase tempcase;
double sim;
int ins;
for (int i=0;i<NUMRET;i++)
Simbuff[i] = Casebuff[i] = 0;
ifstream inFile(DBFILE, ios::in);
if (!inFile)
cout << "File error.";
for (int k=0;k<(NUMCASES+1);k++)
{
for (int j=0; j<(NUMFEAT+1); j++)
{
char ch;
int i=0;
while ( inFile.get(ch) && (ch != '\t' && ch != '\n'))
{
// cout << ch;
tempcase.Feature[j][i] = ch;
i++;
}
tempcase.Feature[j][i]=0;
}
if (k % 100 == 0) cout << "\t" << k << endl;
cb.curcase = tempcase;
sim = SimMetric(newcase, cb.curcase,cb);
int l=0;
ins = FALSE;
while (l<NUMRET && !ins && k> OFFSET)
{
if (sim >= Simbuff[l]) {
InsertB(l, sim,k-OFFSET);
ins=TRUE;
}
l++;
}
}
}
//-------------------------------------------------------------------------
// Retrieve the NUMRET most similar cases.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
void Retrieve(casebase cb, acase newcase)
{
double sim;
int ins;
// Clear buffers
for (int i=0;i<NUMRET;i++)
Simbuff[i] = Casebuff[i] = 0;
for (int l=1;l<NUMCASES+1;l++)
{
// cout << l<< endl;
cb.curcase = GetCaseD(l);
sim = SimMetric(newcase, cb.curcase,cb);
int j=0;
ins = FALSE;
while (j<NUMRET && !ins)
{
if (sim >= Simbuff[j]) {
InsertB(j, sim,l);
ins=TRUE;
}
j++;
}
}
}
//-------------------------------------------------------------------------
// Insert case i with sim. metric sim into place j in retrieve buffer
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
void InsertB(int j, double sim, int i)
{
int k;
for (k=NUMRET-1;k>j;k--)
{
Simbuff[k] = Simbuff[k-1];
Casebuff[k] = Casebuff[k-1];
}
Simbuff[k] = sim;
Casebuff[k] = i;
}
//-------------------------------------------------------------------------
// Compute similarity metric.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
double SimMetric(acase casex, acase casey, casebase cb)
{
double sim = 0;
for (int i=0;i<NUMFEAT+1;i++)
{
sim = sim + f(i, casex, casey, cb);
}
sim = sqrt(sim);
return(sim);
}
//-------------------------------------------------------------------------
// Similarity function.
// V1. 05.02.98 TD.
// V2. 98.92.98 TD. Bug fixed.
//-------------------------------------------------------------------------
double f(int i, acase casex, acase casey, casebase cb)
{
// if (cb.type.Feature[i][0] == 'I') // Integer
// {
// return(0.5);
// }
// else // Symbol
// {
if (strcmp(casex.Feature[i],casey.Feature[i])==0)
return(1);
if (casex.Feature[i] != casey.Feature[i])
{
if (strcmp(casex.Feature[i],"NA")==0&&
strcmp(casey.Feature[i],"NA")!=0)
{
return(FindBayesProb(i,casey,casex,cb));
}
else return(0);
}
// }
}
//-------------------------------------------------------------------------
// Find probability form a Bayes network. Ad-hoc version. Should have used
// DLLs from Microsoft Beleif Networks.
// V1. 05.02.98 TD.
//-------------------------------------------------------------------------
double FindBayesProb(int i,acase casey,acase casex,casebase cb)
{
double probab;
acase tempcase;
tempcase = casex;
strcpy(tempcase.Feature[i],casey.Feature[i]);
probab = SearchBayes(tempcase,i);
return(probab);
}
//-------------------------------------------------------------------------
// Search through a file produced from a Bayesian network for
// P(Feature_value|rest_of_features) when Feature_valus is missing (NA).
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
double SearchBayesD(acase curcase, int l)
{
double probability=0;
double maxprob;
int nummatch=0;
int maxmatch=0;
aprob tempprob;
ifstream inFile(BAYESFILE, ios::in);
if (!inFile)
return(0);
for (int k=0;k<NUMBAYES;k++)
{
for (int j=0; j<(NUMFEAT+3); j++)
{
char ch;
for (int i=0;i<NUMCHAR+1;i++)
tempprob.Feature[j][i]=0;
char f[NUMCHAR];
i=0;
while ( inFile.get(ch) && (ch != '\t' && ch != '\n'))
{
// cout << ch;
tempprob.Feature[j][i] = ch;
i++;
}
}
if (tempprob.Feature[1][0]=l&&
strcmp(tempprob.Feature[l+OFFSET],curcase.Feature[l])==0)
{
tempprob.DestFeature= (int) tempprob.Feature[1][0];
probability = chartoint(tempprob.Feature[0]);
probability = probability/1000;
nummatch = Compare(tempprob,curcase);
if (nummatch > maxmatch) {
maxprob = probability;
maxmatch = nummatch;
}
else if (nummatch == maxmatch && maxprob < probability
&& nummatch >1)
{
maxprob = probability;
}
}
}
return(maxprob);
}
//-------------------------------------------------------------------------
// Search through a file produced from a Bayesian network for
// P(Feature_value|rest_of_features) when Feature_valus is missing (NA).
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
void LoadBayes()
{
ifstream inFile(BAYESFILE, ios::in);
// if (!inFile)
BayesOffset[0]=0;
int l=1;
char ch;
for (long k=0;l<NUMBAYES;k++)
{
inFile.get(ch);
BayesFile[k]=ch;
if (ch=='\n')
{
BayesOffset[l]=k+1;
l++;
}
}
}
//-------------------------------------------------------------------------
// Search through a Bayesian network in memory for
// P(Feature_value|rest_of_features) when Feature_valus is missing (NA).
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
double SearchBayes(acase curcase, int l)
{
double probability=0;
double maxprob=0;
int nummatch=0;
int maxmatch=0;
aprob tempprob;
int m=-1; // Points to location in bayes table.
for (int k=0;k<NUMBAYES;k++)
{
for (int j=0; j<(NUMFEAT+3); j++)
{
char ch;
for (int i=0;i<NUMCHAR+1;i++)
tempprob.Feature[j][i]=0;
char f[NUMCHAR];
i=0;
m++;
ch=BayesFile[m];
while (ch != '\t' && ch != '\n')
{
tempprob.Feature[j][i] = ch;
i++;
m++;
ch=BayesFile[m];
}
}
if (tempprob.Feature[1][0]=l&&
strcmp(tempprob.Feature[l+OFFSET],curcase.Feature[l])==0)
{
tempprob.DestFeature= (int) tempprob.Feature[1][0];
probability = chartoint(tempprob.Feature[0]);
probability = probability/1000;
nummatch = Compare(tempprob,curcase);
if (nummatch > maxmatch) {
maxprob = probability;
maxmatch = nummatch;
}
else if (nummatch == maxmatch && maxprob < probability
&& nummatch >1)
{
maxprob = probability;
}
}
}
return(maxprob);
}
//-------------------------------------------------------------------------
// Find # of features that match.
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
int Compare(aprob tempprob, acase curcase)
{
int m=-1; // must always match one.
for (int i=0;i<NUMFEAT;i++)
{
if (strcmp(tempprob.Feature[i+OFFSET],curcase.Feature[i])==0 &&
strcmp(tempprob.Feature[i+OFFSET],"NA")!=0) m=m+1;
}
return(m);
}
//-------------------------------------------------------------------------
// Convert a char* to integer.
// V1. 06.02.98 TD.
//-------------------------------------------------------------------------
int chartoint(char string[NUMCHAR])
{
int number=0;
int pot=1;
int i=0;
while (i<NUMCHAR&&string[i]!=0) ++i;
i--;
while (i>=0) {
number=number+pot*((int)string[i]-48);
pot=pot*10;
i--;
}
return number;
}