#! /usr/bin/env python
"""Cluster the largest connected component of a graph and plot its quality.

The script loads an edge list, keeps the largest connected component,
clusters it, and plots the cluster-size distribution together with the
coverage and performance scores of the clustering.
"""
from __future__ import print_function

import itertools

import networkx
import pylab
import numpy

# import_edge_list, export_edge_list, mcl_clustering and
# write_node_attributes are not defined in this file; presumably they are
# supplied by this star import.
from bio_provided_sp09 import *


def preprocess_graph(G):
    """Print summary statistics for G and return its largest component.

    The node count, edge count, number of connected components and the
    size of the largest component are printed.  When G has more than one
    component, a histogram of the component sizes is drawn on figure 0.

    Returns a copy of the subgraph induced by the largest connected
    component; on ties the first component of that size is returned.

    Raises ValueError if G has no nodes (there is no component to pick).
    """
    print('Graph G has', len(G), 'nodes and', G.number_of_edges(), 'edges.')

    # Build the component subgraphs from the node sets so that the result is
    # always a real list, whether networkx hands back a list or a generator.
    components = [G.subgraph(nodes).copy()
                  for nodes in networkx.connected_components(G)]
    print('There are', len(components), 'connected components.')

    sizes = [len(component) for component in components]
    max_size = max(sizes)
    print('The largest component has', max_size, 'nodes.')

    if len(components) > 1:
        # Only open the figure when there is actually something to plot.
        pylab.figure(0)
        pylab.hist(sizes)
        pylab.title('Connected component sizes')
        pylab.xlabel('Nodes in component')
        pylab.ylabel('Frequency')

    return components[sizes.index(max_size)]


def _intracluster_edge_count(G, clusts):
    """Sum, over all clusters, the edges of G induced by each cluster."""
    return sum(G.subgraph(clust).number_of_edges() for clust in clusts)


def coverage(G, clusts):
    """Compute coverage of clustering clusts on G.

    Coverage is the number of intracluster ("good") edges divided by the
    total number of edges of G.

    Raises ZeroDivisionError if G has no edges.
    """
    return float(_intracluster_edge_count(G, clusts)) / G.number_of_edges()


def performance(G, clusts):
    """Compute performance of clustering clusts on G.

    Performance is the number of correctly classified node pairs --
    intracluster edges plus absent intercluster edges -- divided by the
    n*(n-1)/2 unordered node pairs of G, where n is the number of nodes.

    Raises ZeroDivisionError if G has fewer than two nodes.
    """
    n = len(G)
    clusts = list(clusts)  # iterate twice below without touching the input

    good_edges = _intracluster_edge_count(G, clusts)

    # For every pair of clusters, the possible intercluster edges number
    # len(a) * len(b); whatever part of that is absent from G counts as
    # correctly "missing".
    missing_bad_edges = 0
    for earlier, later in itertools.combinations(clusts, 2):
        # edge_boundary may return a list or an iterator, so count by
        # iterating instead of calling len().
        bad_edges = sum(1 for _ in networkx.edge_boundary(G, later, earlier))
        missing_bad_edges += len(later) * len(earlier) - bad_edges

    return 2. * (good_edges + missing_bad_edges) / (n * (n - 1))


def main(edges, matrix, out_prefix):
    """Run the clustering analysis and show the resulting plots.

    edges      -- edge list to load with import_edge_list
    matrix     -- accepted for interface compatibility; not used here
    out_prefix -- prefix of the '-edges.txt' and '-na.txt' output files
    """
    G = import_edge_list(edges)
    GC = preprocess_graph(G)
    # NOTE(review): this exports the full graph G, not the largest component
    # GC that is clustered below -- confirm that this is intended.
    export_edge_list(G, out_prefix + "-edges.txt")

    clusts = mcl_clustering(GC)

    pylab.figure(1)
    sizes = [len(c) for c in clusts]
    pylab.title('Cluster size distribution')
    pylab.xlabel('Cluster size')
    pylab.ylabel('Frequency')
    pylab.hist(sizes)

    write_node_attributes(out_prefix + '-na.txt', clusts)

    cov = coverage(GC, clusts)
    perf = performance(GC, clusts)

    pylab.figure(2)
    # The tick positions 1.4 and 2.4 are the centres of the default-width
    # (0.8) bars only when the bars are left-aligned at 1 and 2, so ask for
    # that alignment explicitly instead of relying on the library default.
    pylab.bar([1, 2], [cov, perf], color=['b', 'r'], align='edge')
    pylab.xticks([1.4, 2.4], ['Coverage', 'Performance'])
    pylab.show()


if __name__ == "__main__":
    main('gavin.txt', 'gavin-matrix.txt', 'gavin-out')