import csv
import gzip
import time

import networkx as nx


def printt(string):
    """Print a message prefixed with the elapsed time since start-up."""
    t = time.time() - time_base
    print "[%f] %s" % (t, string)


def load_gzip_csv(filename):
    """Helper function to load CSV data from a gzipped file."""
    gzfile = gzip.GzipFile(filename, 'rb')
    csvfile = csv.reader(gzfile)
    header_line = csvfile.next()
    printt('Reading from CSV file %s with headers %s' % (filename, header_line))
    return csvfile


#####################
# Helper to resolve the "least common ancestor".
# The least common ancestor is the ancestor with the smallest citation depth
# and the latest year. We break ties by picking the paper with the smallest
# paper ID.
#####################
def ancestor_key(x):
    a, d, y = x
    return (d, -y, a)


#####################
# Load and clean the data.
#####################
time_base = time.time()

g = nx.DiGraph()
for pid, year in load_gzip_csv('rawdata/papers.csv.gz'):
    g.add_node(int(pid), year=int(year))

for p1, p2 in load_gzip_csv('rawdata/cites.csv.gz'):
    try:
        src = int(p1)
        dst = int(p2)
        # Filter out bad citations that go backwards in time: a paper can
        # only cite papers published in the same year or earlier.
        if g.node[src]['year'] >= g.node[dst]['year']:
            g.add_edge(src, dst)
    except KeyError:
        # The citation references a paper that is not in papers.csv.gz.
        pass

printt("Read %d papers and %d valid citations"
       % (g.number_of_nodes(), g.number_of_edges()))

#####################
# Pick a set of seed papers.
#####################
N = 50
seeds = [p for p in g.nodes() if p <= N]
printt("Chose %d seed papers (have paper id <= %d)" % (len(seeds), N))

#####################
# Find all papers that any seed paper cites, plus the depth (# citation hops).
# single_source_shortest_path_length also returns the seed itself at depth 0,
# so a seed that cites anything has more than one entry in its depth map.
#####################
citation_depths = {p: nx.single_source_shortest_path_length(g, p)
                   for p in seeds}
printt("%d seed papers cite anything"
       % sum(1 for depths in citation_depths.values() if len(depths) > 1))

#####################
# Find the least common ancestor for every pair of seed papers.
#####################
lca = []
for p1, p2 in ((p1, p2) for p1 in seeds for p2 in seeds if p1 < p2):
    depths1 = citation_depths.get(p1, {})
    depths2 = citation_depths.get(p2, {})
    common = set(depths1).intersection(depths2)
    if not common:
        continue
    # The depth of a common ancestor is the larger of its two distances;
    # ancestor_key then picks the best candidate.
    lca.append((p1, p2) + min(((a, max(depths1[a], depths2[a]),
                                g.node[a]['year']) for a in common),
                              key=ancestor_key))

printt("%d of the %d pairs of seed papers have a common ancestor"
       % (len(lca), len(seeds) * (len(seeds) - 1) / 2))

#####################
# Write the results.
#####################
with open('results.csv', 'wb') as resultsfile:
    writer = csv.writer(resultsfile)
    writer.writerow(['p1', 'p2', 'a', 'depth', 'year'])
    writer.writerows(sorted(lca))

printt("Wrote %d results to results.csv" % len(lca))
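
#####################
# Illustrative sanity check (not part of the pipeline above). A minimal
# sketch of the ancestor_key tie-breaking using made-up (paper_id, depth,
# year) tuples: the smallest depth wins first, then the latest year, then
# the smallest paper id. The `candidates` list here is purely hypothetical.
#####################
candidates = [(7, 2, 1998), (4, 1, 1999), (3, 1, 2001), (5, 1, 2001)]
# (7, ...) loses on depth, (4, ...) loses on year, and (3, ...) beats
# (5, ...) on paper id.
assert min(candidates, key=ancestor_key) == (3, 1, 2001)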