############################################################################## # John Valko (jvalko@purdue.edu) # # # # Graph-based random surfer, ala Sedgewick. # # Note: input is given 1 pair per line. # ############################################################################## from numpy import zeros, array, matrix import random import sys from networkx import * RANDOM_PROB = .1 # Probability of jumping to a random page SURF_COUNT = 1000 # Iterations for simulation def get_transitions(input_file): ''' Function get_transitions takes a file as input, generates the transition graph. Returns the transition graph and the number of pages). ''' f = open(input_file) # Find out how many pages N = int(f.readline()) # Create an empty graph - directed, multiple edges G = MultiDiGraph() # Process link list, adding new nodes and edges to the graph for line in f: i,j = line.split() i,j = int(i),int(j) G.add_edge(i,j) return G, N def simulate(G, N, iters): ''' Function simulate takes the graph, number of pages, and number of iterations as input. It simulates the random surfer making iters moves from page to page; the function returns a hit count array. ''' page_hits = zeros(N) current_site = 0 # Surf's up! for k in xrange(iters): new_site = get_next_page(G, N, current_site) page_hits[new_site] += 1 current_site = new_site return page_hits def get_next_page(G, N, current_site): ''' Function get_next_page uses the graph and the surfers current position to calculate which page the surfer visits next. ''' r = random.uniform(0,1) if r > RANDOM_PROB: next_site = random.choice(G.successors(current_site)) else: next_site = random.choice(G.nodes()) return next_site def show_graph(G): from pylab import show, figure, title figure() title('Page Graph') draw_circular(G) def show_histogram(page_hits): ''' Function show_histogram displays a histogram of the results generated by the random surfer simulation using matplotlib. ''' from numpy import arange from pylab import bar, savefig, show, xticks, xlabel, ylabel, title, figure figure() left = arange(len(page_hits)) - .2 xticks(arange(len(page_hits))) bars = bar(left, page_hits, width=.4, color='red') xlabel('Page number') ylabel('Hits') title('Surf Results') #savefig('test.png') if __name__ == '__main__': from pylab import show # Ask for the input file # in_file = raw_input('Input File: ') if len(sys.argv) < 2: in_file = "mat2.txt" else: in_file = sys.argv[1] G,N = get_transitions(in_file) page_hits = simulate(G, N, SURF_COUNT) print print "Transition Graph for the", N, "pages:" print G print print "Number of page hits during the simulation" #print page_hits for i in range(N): print "Number of times page", i, "was visited:", int(page_hits[i]) print print "Total number of moves made by the web crawler:", int(sum(page_hits)) # Using matPLotLib to generate a histogram if len(G) <= 20: show_graph(G) show_histogram(page_hits) show()