#!/usr/local/bin/perl
#
# Naive-Bayes.pl -- substring-key scoring pass over a list of last names.
#
# Usage: Naive-Bayes.pl lastnames keynames trueornot
#
#   lastnames  one last name per line
#   keynames   whitespace-separated ID keys ("chat", "jee", ...)
#   trueornot  one label per line, aligned line-for-line with lastnames:
#              1 = IS a South Asian name, anything else = IS NOT
#
# For every last name the script counts how many times any ID key occurs in
# it as a substring (overlapping occurrences each count once) and prints the
# name, its score and its true label to STDOUT.
#
# Exits with status 1 and a usage message when the argument count is wrong;
# dies with a diagnostic when an input file cannot be opened.

use strict;
use warnings;

if (@ARGV != 3) {
    print "Usage: $0 needs 3 arguments: Naive-Bayes.pl lastnames keynames trueornot\n";
    exit(1);
}

my ($file1, $file2, $file3) = @ARGV;

# read_records($path)
#   Returns the lines of $path as a list, with the trailing line ending
#   (LF or CRLF) removed.  Blank lines are kept so that line N of one input
#   file still corresponds to line N of another.  Dies if $path cannot be
#   opened.
sub read_records {
    my ($path) = @_;

    open(my $fh, '<', $path) or die "$0: cannot open '$path': $!\n";

    my @records;
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;    # chomp alone would leave "\r" from CRLF files
        push @records, $line;
    }
    close($fh);

    return @records;
}

# count_key_hits($name, $key)
#   Returns the number of positions in $name at which $key occurs as a
#   substring.  Overlapping occurrences are counted.  A key longer than the
#   name can never match and yields 0.
sub count_key_hits {
    my ($name, $key) = @_;

    # An empty key would "match" at every position; treat it as no key.
    return 0 if $key eq '';

    my $hits = 0;
    my $pos  = index($name, $key);
    while ($pos >= 0) {
        $hits++;
        $pos = index($name, $key, $pos + 1);    # step by 1 to allow overlaps
    }
    return $hits;
}

# ---------------------------------------------------------------------------
# Training data
# ---------------------------------------------------------------------------

my @lastnames   = ();    # set of last names
my @lngthnames  = ();    # length of each last name
my @NEWKEY      = ();    # set of ID keys
my @lngthkeys   = ();    # length of each ID key
my @trueornotSA = ();    # true label per last name (1 = South Asian)

my $counter_lastnames      = 0;    # number of last names
my $counter_no_sa_names    = 0;    # number of South Asian names in the labels
my $counter_no_NONsa_names = 0;    # number of non South Asian names in the labels

# Declared but not filled in by this part of the script.
my @condprobabilitiesYTrue  = ();    # Prob{X_1 = 1 | Y = 1}
my @condprobabilitiesYFalse = ();    # Prob{X_1 = 1 | Y = 0}

# Last names, one per line.
@lastnames         = read_records($file1);
@lngthnames        = map { length } @lastnames;
$counter_lastnames = scalar @lastnames;

# ID keys.  Keys from every line are accumulated; split ' ' skips leading
# whitespace, so no empty key is ever produced.
for my $line (read_records($file2)) {
    push @NEWKEY, split ' ', $line;
}
@lngthkeys = map { length } @NEWKEY;
my $number_of_keys = scalar @NEWKEY;

# Labels, one per line.
@trueornotSA = read_records($file3);
for my $label (@trueornotSA) {
    # The comparison is deliberately numeric; a non-numeric label simply
    # counts as "not South Asian" instead of raising a warning.
    no warnings 'numeric';
    if ($label == 1) {
        $counter_no_sa_names++;
    }
    else {
        $counter_no_NONsa_names++;
    }
}

if (@trueornotSA != @lastnames) {
    warn "$0: $file1 has " . scalar(@lastnames) . " names but $file3 has "
        . scalar(@trueornotSA) . " labels\n";
}

# ---------------------------------------------------------------------------
# Scoring
# ---------------------------------------------------------------------------

my @scoreindex   = ();    # score for each last name
my @maxscorename = ();    # name length - 3, recorded per name

for my $i (0 .. $#lastnames) {
    my $score = 0;
    for my $key (@NEWKEY) {
        $score += count_key_hits($lastnames[$i], $key);
    }
    push @scoreindex,   $score;
    push @maxscorename, $lngthnames[$i] - 3;
}

###################### TESTING PURPOSES #####################################
print "Name\t\t Score\t SAorNO\n";
for my $k (0 .. $#lastnames) {
    # A name without a matching label line prints an empty label.
    my $label = defined $trueornotSA[$k] ? $trueornotSA[$k] : '';
    print "$lastnames[$k]\t$scoreindex[$k]\t$label\n";
    print "$lastnames[$k] is $label name\n";
}
###################### END OF TESTING PURPOSES ##############################