-
Notifications
You must be signed in to change notification settings - Fork 24
/
SubgraphFrequencies.scala
150 lines (118 loc) · 5.24 KB
/
SubgraphFrequencies.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/**
* @author Aapo Kyrola <[email protected]>
* @version 1.0
*
* @section LICENSE
*
* Copyright [2014] [Aapo Kyrola / Carnegie Mellon University]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Publication to cite: http://arxiv.org/abs/1403.0701
*/
package edu.cmu.graphchidb.examples
import edu.cmu.graphchidb.{Util, GraphChiDatabase, GraphChiDatabaseAdmin}
import edu.cmu.graphchidb.queries.{ResultEdge, Queries}
import scala.util.Random
import java.io.FileWriter
/**
* Example application for GraphChi-DB that samples vertices from the graph, loads for each vertex
* the induced neighborhood graph (edges that span between the neighbors of the ego-vertex, excluding the ego),
* and then samples three-vertex induces subgraphs from the neighborhood graph and counts the subgraph
* frequencies of the four possible 3-vertex graphs.
*
* It should be straightforward to extend this for 4-vertex subgraphs as well.
*
* This application is based on the following paper and allows reproducing some of the results:
*
* bibtex: @inproceedings{ugander2013subgraph,
title={Subgraph frequencies: Mapping the empirical and extremal geography of large graph collections},
author={Ugander, Johan and Backstrom, Lars and Kleinberg, Jon},
booktitle={Proceedings of the 22nd international conference on World Wide Web},
pages={1307--1318},
year={2013},
organization={International World Wide Web Conferences Steering Committee}
*/
object SubgraphFrequencies {
/**
// Usage:
import edu.cmu.graphchidb.examples.SubgraphFrequencies._
// To compute subgraph freqs of a given vertex neighborhood
computeThreeVertexSubgraphFrequencies(inducedNeighborhoodGraph(DB.originalToInternalId(2419)))
// To produce data similar to used in Figure 1 of Ugander et. al.:
computeDistribution(500)
// R-script to plot the results
library("scatterplot3d")
df = read.table("subgraphs.txt", sep=",", header=TRUE)
triangles <- df$edges3
oneedges <- df$edges1
empty <- df$edges0
scatterplot3d(triangles, empty, oneedges, highlight.3d = TRUE, angle = 30,
col.axis = "blue", col.grid = "lightblue", cex.axis = 1.3,
cex.lab = 1.1, main = "Subgraph Frequencies", pch = 20
)
*/
/* Input graph: change to match your graph */
val baseFilename = System.getProperty("user.home") + "/graphs/DB/livejournal/lj"
val numShards = 16
/* NOTE: to create the database, see SocialNetworkExample.scala */
implicit val DB = new GraphChiDatabase(baseFilename, numShards = numShards)
DB.initialize()
def inducedNeighborhoodGraph(internalId: Long): Set[ResultEdge] = {
val friends = DB.queryOut(internalId, 0)
val directedgraph = Queries.inducedSubgraph(friends.getInternalIds.toSet - internalId, 0) // One might be friend of himself, to remove it
// map so that src < dst and remove pointer, then transform to a set to remove duplicates
// also remove self-edges
directedgraph.map(e => ResultEdge(math.min(e.src, e.dst), math.max(e.src, e.dst), 0) ).filterNot(e => e.src == e.dst).toSet
}
/**
* Returns a four-element vector x, so that x[i] has the fraction of 3-vertex subgraphs with i edges
* @param subgraph
*/
def computeThreeVertexSubgraphFrequencies(subgraph: Set[ResultEdge], numSamples: Int = 11000) : Array[Double] = {
val freqs = new Array[Int](4)
/* Find the vertex IDs in the subgraph */
val vertices = (subgraph.map {_.src} ++ subgraph.map {_.dst}).toSet.toIndexedSeq
if (vertices.size < 4) { // Do not do trivial
return new Array[Double](4)
}
for(i <- 0 until numSamples) {
val sampledVertices = Util.randomIndices(vertices.size, 3) map { j => vertices(j) }
val sampledEdges = subgraph.filter { e => sampledVertices.contains(e.src) && sampledVertices.contains(e.dst)}
if (sampledEdges.size == 4) println(sampledEdges)
freqs(sampledEdges.size) += 1
}
freqs.map { f => f.toDouble / freqs.sum }
}
/**
* Samples N vertices from the graph and computes their subgraph freq vector. Outputs to file "subgraphs.txt".
* Skips trivial vertices (too small neighborhoods).
*/
def computeDistribution(N: Int = 500) = {
var n = 0
val filename = "subgraphs_%d.txt".format(System.currentTimeMillis())
val fw = new FileWriter(filename)
fw.write("edges0,edges1,edges2,edges3\n")
while(n < N) {
val randomVertexInternalId = DB.randomVertex()
val freqs = computeThreeVertexSubgraphFrequencies(inducedNeighborhoodGraph(randomVertexInternalId))
if (freqs.sum > 0) {
fw.write("%f,%f,%f,%f\n".format(freqs(0), freqs(1), freqs(2), freqs(3)))
n += 1
println("%d/%d".format(n, N))
}
}
fw.close()
println("Results in " + filename)
}
}