############################################################################## # John Valko (jvalko@purdue.edu) # # # # Matrix-based random surfer, ala Sedgewick. # # Note: input is given 1 pair per line. # ############################################################################## from numpy import zeros, array, matrix import random import sys RANDOM_PROB = .1 # Probability of jumping to a random page SURF_COUNT = 1000 # Iterations for simulation def get_transitions(input_file): ''' Function get_transitions takes a file as input, generates the transition_matrix, and returns the transition_matrix and the number_of_pages). ''' f = open(input_file) # Find out how many pages N = int(f.readline()) # Create two arrays to storage information about the page link structure link_counts = zeros((N,N)) out_degrees = zeros(N) # Process link list for line in f: i,j = line.split() i,j = int(i),int(j) link_counts[i][j] += 1 out_degrees[i] += 1 # Calculate the transition matrix given the 90/10 rule transition = zeros((N,N)) for i in range(N): for j in range(N): transition[i][j] = ((1-RANDOM_PROB)*link_counts[i][j]/out_degrees[i] + RANDOM_PROB/N) return transition, N def simulate(trans_mat, N, iters): ''' Function trans_mat takes the transition matrix, number of pages, and number of iterations as input. It simulates the random surfer making iters moves from page to page; the function returns a hit count array. ''' page_hits = zeros(N) current_site = 0 # Surf's up! for k in xrange(iters): new_site = get_next_page(trans_mat, N, current_site ) page_hits[new_site] += 1 current_site = new_site return page_hits def get_next_page(trans_mat, N, current_site): ''' Function get_next_page uses the transition matrix and the surfer's current position to calculate which page the surfer visits next. ''' r = random.uniform(0,1) total = 0 for k in range(N): total += trans_mat[current_site][k] if total >= r: return k def show_histogram(page_hits): ''' Function show_histogram displays a histogram of the results generated by the random surfer simulation using matplotlib. ''' from numpy import arange from pylab import bar, savefig, show, xticks, xlabel, ylabel, title left = arange(len(page_hits)) - .2 xticks(arange(len(page_hits))) bars = bar(left, page_hits, width=.4, color='red') xlabel('Page number') ylabel('Hits') title('Surf Results') #savefig('test.png') show() if __name__ == '__main__': # Ask for the input file # in_file = raw_input('Input File: ') in_file = "mat2.txt" M,N = get_transitions(in_file) page_hits = simulate(M, N, SURF_COUNT) print print "Transition Matrix for the", N, "pages:" print M print print "Number of page hits during the simulation" #print page_hits for i in range(N): print "Number of times page", i, "was visited:", int(page_hits[i]) print print "Total number of moves made by the web crawler:", int(sum(page_hits)) # Using matPLotLib to generate a histogram show_histogram(page_hits)