-
Notifications
You must be signed in to change notification settings - Fork 0
/
pagerank.py
145 lines (108 loc) · 4.33 KB
/
pagerank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import json
import sys
from pprint import pprint
import heapq
import csv
'''
This file is designed to do pagerank on the undirected graph.
We will use the algorithm that we used for mapreduce, we just won't
be mapreducing anything.
This means that we will sum up the rankings of neighbors, divided by
the degree.
'''
# This invokes page rank.
def pageRank(topNodes, adjacencyList, weights, numSeeds):
# Right now, we don't want to change the current weights vector.
# So, we make two copies to use.
# newWeights will be used to keep track of the old weights.
newWeights = weights[:]
# updatedWeights will be used to keep track of the updated weights.
updatedWeights = newWeights[:]
# Iterating through each key in the dictionary.
# Recall that weights[key] refers to that key's current weight.
for key in adjacencyList:
neighbors = adjacencyList[key]
# In case there are no neighbors, we don't update any of the
# weights for that key.
if neighbors[0] == "":
continue
# Otherwise, we look at the contributions.
else:
currNodeIdx = int(key)
currNodeRank = newWeights[currNodeIdx]
for neighbor in neighbors:
neighborDegree = len(adjacencyList[neighbor])
neighborIdx = int(neighbor)
neighborRank = newWeights[neighborIdx]
currNodeRank += neighborRank / float(neighborDegree)
updatedWeights[currNodeIdx] = currNodeRank
# Ensuring that we don't keep changing things over and over.
newWeights = updatedWeights[:]
updatedWeights = newWeights[:]
# Finally, setting the value of the weights.
# You could also do weights = updatedWeights I think, but
# I'm doing this to play it safe.
# We can also set the top nodes this way too.
# Note: "i" denotes the node we are on, since the
# index of the weight vector corresponds to the node Id.
for i in range(len(weights)):
weights[i] = updatedWeights[i]
# Finding top nodes.
# Condition if the topNodes isn't fully populated.
if len(topNodes) < numSeeds:
heapq.heappush(topNodes, i)
# Condition if we find a node with rank larger than the
# current minimum.
elif updatedWeights[i] > topNodes[0]:
heapq.heappop(topNodes)
heapq.heappush(topNodes, i)
# Now, sorting.
top.sort(reverse = True)
# This basically just parses the stuff, and then invokes PageRank
def run(rounds = 2):
# This is the JSON file that will be parsed.
JSONfile = sys.argv[1]
# Grabbing the weight file.
weightFile = sys.argv[2]
# Recall that the JSON file is in the form:
# num_players.num_seeds.uniqueID
JSONinfo = JSONfile.split('.')
numPlayers = int(JSONinfo[0])
numSeeds = int(JSONinfo[1])
uniqueID = int(JSONinfo[2])
with open(JSONfile) as dataFile:
JSONdata = json.load(dataFile)
# Storing the top nodes for optimal choices.
topNodes = []
# Storing the adjacency matrix of the entire thing.
adjacencyList = {}
# Storing the weights of the nodes in this line.
weights = []
with open(weightFile, 'rb') as csvFile:
weightReader = csv.reader(csvFile, delimiter=',', quotechar='"')
# This weight file will only be one line.
for row in weightReader:
for weight in row:
weights.append(float(weight))
# Populating this beautiful dictionary.
for key, val in JSONdata.iteritems():
adjacencyList[key] = val
# After this function, topNodes should be populated, as well as weights.
# topNodes should be an array of ints
# weights should be an array of floats.
pageRank(topNodes, adjacencyList, weights, numSeeds)
outFile = open("output.txt", "w")
for i in range(rounds):
for node in topNodes:
outFile.write("%d\n" % node)
if __name__ == '__main__':
# If the number of arguments isn't correct, we print a usage thing.
if len(sys.argv) < 3:
print "Usage: python", sys.argv[0], "GRAPH.json WEIGHTS.csv [ITERATIONS]\n" \
"Example: python", sys.argv[0], "2.5.1.json weights.csv"
sys.exit()
if len(sys.argv) == 4:
rounds = int(sys.argv[3])
run(rounds = rounds)
else:
run()