#! /usr/bin/env python
 
import networkx
import pylab
import numpy
from bio_provided_sp09 import *
 
def preprocess_graph(G):
    """Report component statistics for G and return its largest component.

    Prints the node and edge counts of G, the number of connected
    components, and the node count of the largest component.  When G has
    more than one component, a histogram of the component sizes is drawn on
    pylab figure 0; the figure is only drawn here, not shown.

    Parameters:
        G -- graph accepted by networkx.connected_component_subgraphs.

    Returns:
        The component subgraph with the most nodes.  On ties, the first
        such component in the order networkx produced them.

    Raises:
        ValueError -- if networkx reports no connected components
                      (e.g. G has no nodes).
    """
    # A single parenthesised argument with %-formatting prints the same text
    # whether print is a statement or a function.
    print('Graph G has %s nodes and %s edges.' % (len(G), len(G.edges())))

    # Materialise the components: len() and indexing are needed below, and
    # some networkx releases hand back a generator rather than a list.
    C = list(networkx.connected_component_subgraphs(G))

    print('There are %s connected components.' % len(C))

    if not C:
        # max() on an empty list would raise a much less helpful ValueError.
        raise ValueError('Graph has no connected components.')

    # Select figure 0 once; the histogram below (if any) is drawn on it.
    pylab.figure(0)

    sizes = [len(g) for g in C]

    max_size = max(sizes)
    print('The largest component has %s nodes.' % max_size)

    if len(C) > 1:
        pylab.hist(sizes)
        pylab.title('Connected component sizes')
        pylab.xlabel('Nodes in component')
        pylab.ylabel('Frequency')

    # list.index returns the first match, so ties go to the earliest component.
    return C[sizes.index(max_size)]
 
def coverage(G, clusts):
    """Return the coverage of the clustering clusts on G.

    For every cluster, the edges of the subgraph of G induced by that
    cluster are counted; the total is divided by the number of edges in G.

    Parameters:
        G      -- graph providing subgraph() and number_of_edges().
        clusts -- iterable of clusters, each a collection of nodes of G.

    Returns:
        The ratio as a float.

    Raises:
        ZeroDivisionError -- if G has no edges.
    """
    # Edges of the induced subgraph are the intracluster ("good") edges.
    intracluster = sum(len(G.subgraph(members).edges()) for members in clusts)
    return float(intracluster) / G.number_of_edges()
 
def performance(G, clusts):
    """Compute performance of clustering clusts on G.

    The score is (intracluster edges + missing intercluster edges) divided
    by n*(n-1)/2, where n is the number of nodes in G.  "Missing
    intercluster edges" are node pairs in two different clusters that are
    not joined by an edge.

    Parameters:
        G      -- graph providing subgraph() and usable with
                  networkx.edge_boundary.
        clusts -- sequence of clusters, each a sized collection of nodes.
                  It is not modified.
                  NOTE(review): the n1*n2 pair count presumes the clusters
                  are pairwise disjoint -- confirm with the clustering code.

    Returns:
        The score as a float.

    Raises:
        ZeroDivisionError -- if G has fewer than two nodes.
    """
    n = len(G)                  # how many nodes are in G?

    # "good" edges: both endpoints inside the same cluster
    good_edges = sum(len(G.subgraph(clust).edges()) for clust in clusts)

    # missing "bad" edges: visit every unordered pair of clusters once
    missing_bad_edges = 0
    for i, clust1 in enumerate(clusts):
        for clust2 in clusts[i + 1:]:
            # Count the edges running between the two clusters.  Counting
            # by iteration works whether edge_boundary returns a list or an
            # iterator.
            bad_edges = sum(1 for _ in networkx.edge_boundary(G, clust1, clust2))
            # subtract from possible number of bad edges (n1*n2, ni=len(clusti))
            missing_bad_edges += len(clust1) * len(clust2) - bad_edges

    return 2. * (good_edges + missing_bad_edges) / (n * (n - 1))
 
def main(edges, matrix, out_prefix):
    """Cluster the largest component of a graph and plot quality measures.

    Steps, in order: load the graph from edges, reduce it to its largest
    connected component with preprocess_graph, write an edge list, cluster
    the component with mcl_clustering, plot the cluster size histogram,
    write node attributes for the clusters, then plot coverage and
    performance as a two-bar chart and show all figures.

    Parameters:
        edges      -- passed to import_edge_list.
        matrix     -- accepted but never used by this function.
        out_prefix -- prefix for the two output names
                      (out_prefix + "-edges.txt" and out_prefix + "-na.txt").

    import_edge_list, export_edge_list, mcl_clustering and
    write_node_attributes are not defined in this file; presumably they come
    from the bio_provided_sp09 star import -- verify.
    """
    import pylab

    full_graph = import_edge_list(edges)
    largest_component = preprocess_graph(full_graph)

    # NOTE(review): the full graph is exported here, not the largest
    # component -- confirm this is intended.
    export_edge_list(full_graph, out_prefix + "-edges.txt")

    clusters = mcl_clustering(largest_component)

    # Figure 1: histogram of cluster sizes.
    pylab.figure(1)
    cluster_sizes = [len(cluster) for cluster in clusters]
    pylab.hist(cluster_sizes)
    pylab.title('Cluster size distribution')
    pylab.xlabel('Cluster size')
    pylab.ylabel('Frequency')

    write_node_attributes(out_prefix + '-na.txt', clusters)

    scores = [coverage(largest_component, clusters),
              performance(largest_component, clusters)]

    # Figure 2: coverage (blue) next to performance (red).
    pylab.figure(2)
    pylab.bar([1, 2], scores, color=['b', 'r'])
    # Tick positions are offset by 0.4 from the bar x values; presumably this
    # centres the labels under left-aligned bars -- confirm for the
    # matplotlib version in use.
    pylab.xticks([1.4, 2.4], ['Coverage', 'Performance'])

    pylab.show()
 
# Script entry point: run the pipeline on the fixed 'gavin' input files.
if __name__ == "__main__":
    main(edges='gavin.txt',
         matrix='gavin-matrix.txt',
         out_prefix='gavin-out')
 
# cs190c/project4sol_09.txt - Last modified: 2009/04/20 16:49 by tang
#
# Recent changes RSS feed Creative Commons License Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki