Differences

This shows you the differences between two versions of the page.

Link to this comparison view

cs190c:lab13_09 [2009/04/17 11:34]
tang created
cs190c:lab13_09 [2009/04/29 23:02] (current)
tang
Line 69: Line 69:
   * {{cs190c:​shortquery.txt|}} has 10 words in it   * {{cs190c:​shortquery.txt|}} has 10 words in it
  
 +===== Solution =====
 +<code python>​import pylab
 +import sys
 +from time import time, clock
 +from bisect import bisect_right,​ bisect_left
 +
 +if not sys.platform == '​win32':​
 +    clock = time
 +
 +def sorted_count(doc_list_sorted,​ x):
 +    """​
 +    Given a sorted list and an element x, returns the number of occurences of
 +    x in the sorted list.
 +    """​
 +    return bisect_right(doc_list_sorted,​ x) - bisect_left(doc_list_sorted,​ x)
 +    ​
 +if __name__ == '​__main__':​
 +    files = ['​the_prince.txt',​ '​frankenstein.txt',​ '​war_and_peace.txt'​]
 +    querylist = '​querylist.txt'​
 +    # If you want a shorter set of data to test, comment out the above two lines
 +    # and uncomment the below two. Depending on your graph style, you may need
 +    # to change them, since matplotlib will not draw a line with a single point.
 +    #files = ['​shortfile.txt'​]
 +    #querylist = '​shortquery.txt'​
 +    query_list = []
 +    time_list = []
 +    time_list_query = []
 +    time_list_sorted = []
 +    time_list_sorted_query = []
 +    time_dict = []
 +    time_dict_query = []
 +    x_axis = []
 +    ​
 +    f = open(querylist,​ '​r'​)
 +    for line in f:
 +        query_list.append(line.rstrip())
 +        ​
 +    for file in files:
 +        # The below code should require no modification. It creates the unsorted
 +        # list and sorted list and performs the query for both.
 +        start = clock()
 +        f = open(file, '​r'​)
 +        doc_list = []
 +        for line in f:
 +            words = line.split()
 +            for word in words:
 +                doc_list.append(word)
 +        end = clock()
 +        time_list.append(end - start)
 +        start = clock()
 +        for query in query_list:
 +            doc_list.count(query)
 +        end = clock()
 +        time_list_query.append(end - start)
 +    ​
 +        start = clock()
 +        f = open(file, '​r'​)
 +        doc_list_sorted = []
 +        for line in f:
 +            words = line.split()
 +            for word in words:
 +                doc_list_sorted.append(word)
 +        doc_list_sorted.sort()
 +        end = clock()
 +        time_list_sorted.append(end - start)
 +        start = clock()
 +        for query in query_list:
 +            sorted_count(doc_list_sorted,​ query)
 +        end = clock()
 +        time_list_sorted_query.append(end - start)
 +
 +        # Put your code for creating and querying the dictionary here. See the
 +        # lab spec for how to deal with creating and querying the dictionary
 +        # structure, if you are having problems with it. You should re-open
 +        # the file and use the pattern above for creating the dictionary. You
 +        # should also record the number of words that you come across and store
 +        # it in the x_axis list.
 +        start = clock()
 +        doc_dict = {}
 +        f = open(file, '​r'​)
 +        for line in f:
 +            words = line.split()
 +            for word in words:
 +                doc_dict[word] = doc_dict.get(word,​ 0) + 1
 +        end = clock()
 +        time_dict.append(end - start)
 +        start = clock()
 +        for query in query_list:
 +            doc_dict.get(query,​ 0)
 +        end = clock()
 +        time_dict_query.append(end - start)
 +        x_axis.append(len(doc_list))
 +
 +    # Put your explanation here as to why the times are as observed. Also put
 +    # Your graph drawing here. Ideally you should include a legend to show
 +    # what lines are what timings.
 +    pylab.plot(x_axis,​ time_list, '​r--',​ label='​doc creation'​)
 +    pylab.plot(x_axis,​ time_list_sorted,​ '​g--',​ label='​sorted creation'​)
 +    pylab.plot(x_axis,​ time_dict, '​b--',​ label='​dict creation'​)
 +    pylab.plot(x_axis,​ time_list_query,​ '​r-',​ label='​doc query'​)
 +    pylab.plot(x_axis,​ time_list_sorted_query,​ '​g-',​ label='​sorted query'​)
 +    pylab.plot(x_axis,​ time_dict_query,​ '​b-',​ label='​dict query'​)
 +    pylab.legend(loc=0)
 +    pylab.show()
 +</​code>​
 
cs190c/lab13_09.txt ยท Last modified: 2009/04/29 23:02 by tang
 
Recent changes RSS feed Creative Commons License Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki