Differences

This shows you the differences between two versions of the page.

 cs190c:lab13_09 [2009/04/17 11:34]tang created cs190c:lab13_09 [2009/04/29 23:02] (current)tang 2009/04/29 23:02 tang 2009/04/17 11:34 tang created 2009/04/29 23:02 tang 2009/04/17 11:34 tang created Line 69: Line 69: * {{cs190c:​shortquery.txt|}} has 10 words in it * {{cs190c:​shortquery.txt|}} has 10 words in it + ===== Solution ===== + ​import pylab + import sys + from time import time, clock + from bisect import bisect_right,​ bisect_left + + if not sys.platform == '​win32':​ + clock = time + + def sorted_count(doc_list_sorted,​ x): + """​ + Given a sorted list and an element x, returns the number of occurences of + x in the sorted list. + """​ + return bisect_right(doc_list_sorted,​ x) - bisect_left(doc_list_sorted,​ x) + ​ + if __name__ == '​__main__':​ + files = ['​the_prince.txt',​ '​frankenstein.txt',​ '​war_and_peace.txt'​] + querylist = '​querylist.txt'​ + # If you want a shorter set of data to test, comment out the above two lines + # and uncomment the below two. Depending on your graph style, you may need + # to change them, since matplotlib will not draw a line with a single point. + #files = ['​shortfile.txt'​] + #querylist = '​shortquery.txt'​ + query_list = [] + time_list = [] + time_list_query = [] + time_list_sorted = [] + time_list_sorted_query = [] + time_dict = [] + time_dict_query = [] + x_axis = [] + ​ + f = open(querylist,​ '​r'​) + for line in f: + query_list.append(line.rstrip()) + ​ + for file in files: + # The below code should require no modification. It creates the unsorted + # list and sorted list and performs the query for both. + start = clock() + f = open(file, '​r'​) + doc_list = [] + for line in f: + words = line.split() + for word in words: + doc_list.append(word) + end = clock() + time_list.append(end - start) + start = clock() + for query in query_list: + doc_list.count(query) + end = clock() + time_list_query.append(end - start) + ​ + start = clock() + f = open(file, '​r'​) + doc_list_sorted = [] + for line in f: + words = line.split() + for word in words: + doc_list_sorted.append(word) + doc_list_sorted.sort() + end = clock() + time_list_sorted.append(end - start) + start = clock() + for query in query_list: + sorted_count(doc_list_sorted,​ query) + end = clock() + time_list_sorted_query.append(end - start) + + # Put your code for creating and querying the dictionary here. See the + # lab spec for how to deal with creating and querying the dictionary + # structure, if you are having problems with it. You should re-open + # the file and use the pattern above for creating the dictionary. You + # should also record the number of words that you come across and store + # it in the x_axis list. + start = clock() + doc_dict = {} + f = open(file, '​r'​) + for line in f: + words = line.split() + for word in words: + doc_dict[word] = doc_dict.get(word,​ 0) + 1 + end = clock() + time_dict.append(end - start) + start = clock() + for query in query_list: + doc_dict.get(query,​ 0) + end = clock() + time_dict_query.append(end - start) + x_axis.append(len(doc_list)) + + # Put your explanation here as to why the times are as observed. Also put + # Your graph drawing here. Ideally you should include a legend to show + # what lines are what timings. + pylab.plot(x_axis,​ time_list, '​r--',​ label='​doc creation'​) + pylab.plot(x_axis,​ time_list_sorted,​ '​g--',​ label='​sorted creation'​) + pylab.plot(x_axis,​ time_dict, '​b--',​ label='​dict creation'​) + pylab.plot(x_axis,​ time_list_query,​ '​r-',​ label='​doc query'​) + pylab.plot(x_axis,​ time_list_sorted_query,​ '​g-',​ label='​sorted query'​) + pylab.plot(x_axis,​ time_dict_query,​ '​b-',​ label='​dict query'​) + pylab.legend(loc=0) + pylab.show() +