List of commits:
Subject Hash Author Date (UTC)
initial commit - pre colloquim state 655c77556f9d8e40b52893887cdb0d90f726fdbf Mathias Ehlert 2013-11-22 13:47:29
Initial commit f53ec7a3f25d55c53aa12c2682b216e16570cdc7 Mathias Ehlert 2013-11-22 13:37:47
Commit 655c77556f9d8e40b52893887cdb0d90f726fdbf - initial commit - pre colloquim state
Author: Mathias Ehlert
Author date (UTC): 2013-11-22 13:47
Committer name: Mathias Ehlert
Committer date (UTC): 2013-11-22 13:47
Parent(s): f53ec7a3f25d55c53aa12c2682b216e16570cdc7
Signer:
Signing key:
Signing status: N
Tree: b66c7ee4e0ce05cf15db3f09fcf7a86cbb42b892
File Lines added Lines deleted
__init__.py 0 0
data/Dataset_2012.txt 244757 0
data/test_dataset.txt 10 0
file_importer.py 28 0
file_importer.pyc 0 0
metric_calculator.py 395 0
metric_calculator.pyc 0 0
profiling.py 43 0
run.py 40 0
test.py 17 0
File __init__.py added (mode: 100644) (index 0000000..e69de29)
The diff for file data/Dataset_2012.txt is too big (244757 changes) and cannot be shown.
File data/test_dataset.txt added (mode: 100644) (index 0000000..47cab8b)
1 1 2
2 1 3
3 1 4
4 2 5
5 2 6
6 3 4
7 3 6
8 4 7
9 5 8
10 6 9
File file_importer.py added (mode: 100644) (index 0000000..9796f9f)
1 import networkx as nx
2 import redis as rd
3
4 class FileImporter(object):
5 def __init__(self,filename):
6 self.data_file = open(filename)
7 self.all_nodes = []
8 self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
9 self.graph = nx.Graph()
10
11 def read(self):
12 self.redis.flushdb()
13 for line in self.data_file:
14 self.parse_line(line)
15 self.save_all_nodes()
16 return self.graph
17
18 def parse_line(self, line):
19 fields = line.strip().split("\t")
20 from_node = int(fields[0])
21 to_node = int(fields[1])
22 self.all_nodes.extend([from_node,to_node])
23 self.graph.add_edge(from_node, to_node)
24
25 def save_all_nodes(self):
26 self.unique_nodes = list(set(self.all_nodes))
27 self.unique_nodes.sort()
28 self.redis.sadd('all_nodes', *self.unique_nodes)
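FileImporter turns a tab-separated edge list (one "from<TAB>to" pair per line, as in data/test_dataset.txt above) into an undirected networkx graph and writes the sorted, de-duplicated node ids into the redis set 'all_nodes'. A minimal usage sketch, assuming a redis instance is running on localhost:6379 and the bundled test dataset is used:

from file_importer import FileImporter

# parse the edge list; read() flushes redis db 0 and fills the 'all_nodes' set
importer = FileImporter('data/test_dataset.txt')
graph = importer.read()

print(graph.number_of_nodes())   # 9 nodes in the test dataset
print(graph.number_of_edges())   # 10 edges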
File file_importer.pyc added (mode: 100644) (index 0000000..9e675cc)
File metric_calculator.py added (mode: 100644) (index 0000000..a573c4e)
1 import networkx as nx
2 import redis as rd
3 import numpy as np
4
5
6 class MetricCalculator(object):
7 def __init__ (self, graph):
8 self.graph = graph
9 self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
10 self.nodes = nx.nodes(graph)
11
12 self.node_neighbors_prefix = 'node_neighbors:'
13 self.node_prefix = 'node_metrics:'
14 self.normalization_suffix = '_normalized'
15
16 # definition of all base metrics for which absolute values will be calculated for each node in the first step
17 # key is the name of the metric and value is the implemented method which exposes the required interface
18 # interface: each method takes the node as the single parameter, performs the necessary calculation and
19 # returns a float containing the value for the specified node
20
21 self.metrics = { 'clustering_coefficient' : self.clustering_coefficient,
22 'degree' : self.degree,
23 'average_neighbor_degree' : self.average_neighbor_degree,
24 'iterated_average_neighbor_degree': self.iterated_average_neighbor_degree,
25 'betweenness_centrality' : self.betweenness_centrality,
26 'eccentricity' : self.eccentricity,
27 'average_shortest_path_length' : self.average_shortest_path_length
28 }
29
30
31 # for the frontend
32 # self.metric_names = {
33 # 'clustering_coefficient' : 'Clustering Coefficient',
34 # 'degree' : 'Node Degree',
35 # 'average_neighbor_degree' : 'Average Neighbor Node Degree',
36 # 'iterated_average_neighbor_degree': 'Iterated Average Neighbor Node Degree',
37 # 'betweenness_centrality' : 'Betweenness Centrality',
38 # 'eccentricity' : 'Node Eccentricity',
39 # 'average_shortest_path_length' : 'Average Shortest Path Length'
40 # }
41
42
43 # some metrics might require corrections or post-processing which relies on the values of other metrics or normalizations
44 # key is the metric name and value the method for correction
45
46
47 self.corrections = {'corrected_clustering_coefficient' : self.correct_clustering_coefficient,
48 'corrected_average_neighbor_degree' : self.correct_average_neighbor_degree,
49 'corrected_iterated_average_neighbor_degree': self.correct_iterated_average_neighbor_degree}
50
51
52
53 # for every metric, a normalization method has to be specified
54 # key is the name of the metric and value is the normalization method which also has to expose the required interface
55 # interface: normalization methods take the name of the (absolute) metric as the single argument; no return value is required
56 # the method itself shall access the data which is required for normalization from the redis instance
57 # and the corresponding keys/values for the specified metric
58 # it shall then loop over all nodes and calculate the normalized value for the node and the metric
59 # afterwards it should save the result to redis using "metric_name_normalized" as the key
60 # the result is stored inside the node's hash for metrics
61
62 # also needs to include corrected metrics with their respective names
63 #
64 self.normalization_methods = { 'clustering_coefficient' : self.min_max_normalization,
65 'corrected_clustering_coefficient' : self.min_max_normalization,
66 'degree' : self.min_max_normalization,
67 'average_neighbor_degree' : self.min_max_normalization,
68 'corrected_average_neighbor_degree' : self.min_max_normalization,
69 'iterated_average_neighbor_degree' : self.min_max_normalization,
70 'corrected_iterated_average_neighbor_degree': self.min_max_normalization,
71 'betweenness_centrality' : self.min_max_normalization,
72 'eccentricity' : self.inverse_min_max_normalization,
73 'average_shortest_path_length' : self.inverse_min_max_normalization
74 }
75
76
77 # the simplest case for a score is a weighted combination of normalized metric values whose weights add up to 1
78 # such scores can easily be defined here
79
80 #self.scores = ['unified_risk_score']
81
82 self.scores = {'unified_risk_score': { #'corrected_clustering_coefficient': 0.2,
83 'degree_normalized': 0.25,
84 'corrected_average_neighbor_degree_normalized': 0.15,
85 'corrected_iterated_average_neighbor_degree_normalized': 0.1,
86 'betweenness_centrality_normalized': 0.25,
87 'eccentricity_normalized': 0.125,
88 'average_shortest_path_length_normalized': 0.125}
89 }
90
91
92 # other scores might require a more sophisticated algorithm to be calculated
93 # such scores need to be added here and implemented like the example below
94
95 self.advanced_scores = {'advanced_unified_risk_score': self.urs_clustering_coefficient_modification}
96
97
98
99 def start(self):
100
101 self.store_neighbors()
102 self.calculate_metrics()
103 self.calculate_corrections()
104 self.normalize_metrics()
105 self.calculate_scores()
106 self.calculate_advanced_scores()
107
108
109
110 # write list of neighbors of each node to redis for navigation purposes in frontend
111 def store_neighbors(self):
112 for node in self.nodes:
113 node_neighbors = self.graph.neighbors(int(node))
114 self.redis.sadd(self.node_neighbors_prefix+str(node), *node_neighbors)
115
116 # loop through all defined metrics and call specified calculation method for each node
117 def calculate_metrics(self):
118 for metric_name in self.metrics:
119 metric_method = self.metrics[metric_name]
120
121 # loop through all nodes
122 for node in self.nodes:
123
124 # call calculation method of supplied metric for current node
125 node = int(node)
126 value = float(metric_method(node))
127
128 #store result in node values
129 self.redis.hset(self.node_prefix+str(node), metric_name, value)
130
131 #also store result to metric set
132 self.redis.zadd(metric_name, value, str(node))
133
134 # loop through all defined corrections and call specified calculation method
135 def calculate_corrections(self):
136 for correction_name in self.corrections:
137 correction_method = self.corrections[correction_name]
138 for node in self.nodes:
139 node = int(node)
140 value = float(correction_method(node))
141
142 #store result in node values
143 self.redis.hset(self.node_prefix+str(node), correction_name, value)
144
145 #also store result to metric set
146 self.redis.zadd(correction_name, value, str(node))
147
148
149 # loop through all defined normalizations and call respective normalization method
150 # no default normalizations for metrics not listed in the "normalization_methods" hash
151 def normalize_metrics(self):
152 for metric_name in self.normalization_methods:
153 normalization_method = self.normalization_methods[metric_name]
154 normalization_method(metric_name)
155
156 # normalizations
157 # min max normalization
158 def min_max_normalization(self,metric_name):
159 #perform min max normalization of specified metric for all nodes
160 #min_max normalization
161 #get min and max from redis
162 x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
163 x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
164
165 #print x_min
166 #print x_max
167
168 for node in self.nodes:
169 if x_min == x_max:
170 x_normalized = 1.0
171 else:
172 x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
173 x_normalized = (x - x_min) / (x_max - x_min)
174
175 #store value for node and metric
176 self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
177 self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
178
179 # inverse min max normalization
180 def inverse_min_max_normalization(self,metric_name):
181 x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
182 x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
183
184 for node in self.nodes:
185 if x_min == x_max:
186 x_normalized = 1.0
187 else:
188 x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
189 x_normalized = (x_max - x) / (x_max - x_min)
190
191 #store value for node and metric
192 self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
193 self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
194
195
196 def calculate_scores(self):
197 for score_name in self.scores:
198 metrics_with_weights = self.scores[score_name]
199
200 for node in self.nodes:
201 score_value = 0.0
202
203 for metric in metrics_with_weights:
204 weight = self.scores[score_name][metric]
205 value = float(self.redis.hget(self.node_prefix+str(node),metric))
206 score_value += weight * value
207
208 self.redis.hset(self.node_prefix+str(node),score_name, score_value)
209 self.redis.zadd(score_name, score_value, str(node))
210
211 def calculate_advanced_scores(self):
212 for advanced_score in self.advanced_scores:
213 self.advanced_scores[advanced_score]()
214
215
216 ###################################################
217 # actual metrics and corrections etc. below
218 # must return value which can be converted to float
219
220 def clustering_coefficient(self,node):
221 # on the first call, calculate the metric for all nodes at once and cache the result on the instance for later access
222 #NOTE: this should result in a performance gain, but for very large graphs this might be a problem.
223 # in this case, just returning nx.clustering(self.graph, node) might be better
224 if not hasattr(self, 'all_clustering_coefficients'):
225 self.all_clustering_coefficients = nx.clustering(self.graph)
226
227 #get the actual value from the pre-calculated hash
228 return self.all_clustering_coefficients[node]
229
230 def degree(self, node):
231 return self.graph.degree(node)
232
233
234 def average_neighbor_degree(self,node):
235 # same caching technique as in self.clustering_coefficient
236 # might also break for very large graphs
237 # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go
238
239 if not hasattr(self, 'all_average_neighbor_degrees'):
240 self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
241 return self.all_average_neighbor_degrees[node]
242
243 def iterated_average_neighbor_degree(self, node):
244
245 first_level_neighbors = self.graph.neighbors(node)
246 second_level_neighbors = []
247
248 # get all two-hop nodes
249 for first_level_neighbor in first_level_neighbors:
250 current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
251 second_level_neighbors.extend(current_second_level_neighbors)
252
253 #remove one-hop nodes and self
254 relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
255
256 degree_sum = 0
257 for relevant_node in relevant_nodes:
258 degree_sum += self.graph.degree(relevant_node)
259
260 return float(degree_sum)/float(len(relevant_nodes))
261
262 def betweenness_centrality(self, node):
263 if not hasattr(self, 'all_betweenness_centralities'):
264 self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
265 return self.all_betweenness_centralities[node]
266
267 def eccentricity(self, node):
268 if not hasattr(self, 'all_eccentricities'):
269 self.all_eccentricities = nx.eccentricity(self.graph)
270 return self.all_eccentricities[node]
271
272 def average_shortest_path_length(self, node):
273 # caching average_shortest_path_length for all nodes at once failed
274 # already switched to per-node calculation
275
276 #get all shortest path lengths
277 all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)
278
279 #calculate average
280 sum_of_lengths = 0
281 for target in all_shortest_path_lengths_for_node:
282 sum_of_lengths += all_shortest_path_lengths_for_node[target]
283
284 return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)
285
286
287 #############
288 # corrections
289 #############
290 def correct_clustering_coefficient(self,node):
291 clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
292 degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
293 corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
294
295 return corrected_cc
296
297 #def correct_clustering_coefficient(self):
298
299 # for node in self.nodes:
300 # clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
301 # degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
302
303 # corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
304
305 # self.redis.hset(self.node_prefix+str(node), 'corrected_clustering_coefficient', corrected_cc)
306 # self.redis.zadd('corrected_clustering_coefficient', corrected_cc, str(node))
307
308 def correct_average_neighbor_degree(self,node):
309 avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))
310
311 neighbors = self.graph.neighbors(node)
312 number_of_neighbors = float(len(neighbors))
313 neighbor_degrees = []
314 for neighbor in neighbors:
315 neighbor_degrees.append(self.graph.degree(neighbor))
316
317 #using numpy median and standard deviation implementation
318 numpy_neighbor_degrees = np.array(neighbor_degrees)
319 median = np.median(numpy_neighbor_degrees)
320 standard_deviation = np.std(numpy_neighbor_degrees)
321
322 if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
323 return avgnd
324 else:
325 return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_neighbors ) * avgnd
326
327 #return 18
328
329 def correct_iterated_average_neighbor_degree(self, node):
330 avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))
331
332 first_level_neighbors = self.graph.neighbors(node)
333 second_level_neighbors = []
334
335 # get all two-hop nodes
336 for first_level_neighbor in first_level_neighbors:
337 current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
338 second_level_neighbors.extend(current_second_level_neighbors)
339
340 #remove one-hop neighbors and self
341 relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
342
343 number_of_nodes = len(relevant_nodes)
344 node_degrees = []
345 for rel_node in relevant_nodes:
346 node_degrees.append(self.graph.degree(rel_node))
347
348 numpy_node_degrees = np.array(node_degrees)
349 median = np.median(numpy_node_degrees)
350 standard_deviation = np.std(numpy_node_degrees)
351
352 if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
353 return avgnd
354 else:
355 return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_nodes ) * avgnd
356
357
358
359
360 ################
361 #advanced scores
362 ################
363
364 def urs_clustering_coefficient_modification(self):
365
366 #caching of values
367 all_ccs_normalized = dict(self.redis.zrange('corrected_clustering_coefficient'+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float))
368 all_urs = dict(self.redis.zrange('unified_risk_score', 0, -1, withscores=True, score_cast_func=float))
369
370 urs_percentile_10 = np.percentile(all_urs.values(), 10)
371 urs_percentile_90 = np.percentile(all_urs.values(), 90)
372
373 for node in self.nodes:
374 #cc_normalized = float(self.redis.hget(self.node_prefix+str(node),'corrected_clustering_coefficient'+self.normalization_suffix))
375 #urs = float(self.redis.hget(self.node_prefix+str(node),'unified_risk_score'))
376
377 cc_normalized = all_ccs_normalized[str(node)]
378 urs = all_urs[str(node)]
379
380
381
382 if (urs >= urs_percentile_90 or urs <= urs_percentile_10):
383 if (cc_normalized >= 0.25):
384 advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
385 else:
386 advanced_unified_risk_score = urs
387 else:
388 advanced_unified_risk_score = urs
389
390 #save for node
391 self.redis.hset(self.node_prefix+str(node), 'advanced_unified_risk_score', advanced_unified_risk_score)
392 #save for metric
393 self.redis.zadd('advanced_unified_risk_score', advanced_unified_risk_score, str(node))
394
395
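The comment blocks above define a small plugin interface: a metric method takes a node as its single parameter and returns a float, every metric (including corrected ones) needs an entry in normalization_methods, and simple scores are dictionaries of normalized metric names with weights that add up to 1 (the unified_risk_score weights above sum to exactly 1.0). A hedged sketch of registering one additional metric under that interface; closeness centrality is only an illustrative choice and is not part of this commit:

# hypothetical method on MetricCalculator, following the documented interface
def closeness_centrality(self, node):
    if not hasattr(self, 'all_closeness_centralities'):
        # cache the full result dict on the instance, like the other metrics do
        self.all_closeness_centralities = nx.closeness_centrality(self.graph)
    return self.all_closeness_centralities[node]

# the two dictionary entries would go into __init__ so that calculate_metrics()
# and normalize_metrics() pick the new metric up automatically
self.metrics['closeness_centrality'] = self.closeness_centrality
self.normalization_methods['closeness_centrality'] = self.min_max_normalization

For a single node, calculate_scores() then computes unified_risk_score as 0.25*degree_normalized + 0.15*corrected_average_neighbor_degree_normalized + 0.1*corrected_iterated_average_neighbor_degree_normalized + 0.25*betweenness_centrality_normalized + 0.125*eccentricity_normalized + 0.125*average_shortest_path_length_normalized.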
File metric_calculator.pyc added (mode: 100644) (index 0000000..ff74136)
File profiling.py added (mode: 100644) (index 0000000..798f46e)
1 from metric_calculator import MetricCalculator
2 import networkx as nx
3 import redis as rd
4 import sys
5 import cProfile, pstats, StringIO
6
7 redis = rd.StrictRedis(host='localhost', port=6379, db=0)
8
9 #random_runs = [[100,0.2],[100,0.3]]
10 random_runs = [[1000,0.05],[1000,0.1],[1000,0.2],[10000,0.3],[1000,0.4],[2000,0.2],[3000,0.2],[4000,0.2],[5000,0.2],[6000,0.2]]
11
12
13 for graph_configuration in random_runs:
14
15 number_of_nodes = graph_configuration[0]
16 probability_of_connection = graph_configuration[1]
17
18 graph = nx.fast_gnp_random_graph(number_of_nodes,probability_of_connection,seed=1)
19
20 nodes = nx.nodes(graph)
21 #barabasi_albert_graph(n, m, seed=None)[source]
22
23 if not nx.is_connected(graph):
24 print "not connected"
25 sys.exit(-1)
26
27 redis.flushdb()
28 redis.sadd('all_nodes', *nodes)
29
30 mc = MetricCalculator(graph)
31
32 pr = cProfile.Profile()
33 pr.enable()
34
35 mc.start()
36
37 s = StringIO.StringIO()
38 sortby = 'cumulative'
39 ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
40 ps.print_stats()
41
42 outfile = open('auto_profiling_output_'+str(number_of_nodes)+'_'+str(probability_of_connection)+'.txt', 'w')
43 outfile.write(s.getvalue())
File run.py added (mode: 100644) (index 0000000..682220f)
1 #!/usr/bin/env python
2
3 from file_importer import FileImporter
4 from metric_calculator import MetricCalculator
5
6 import cProfile, pstats, StringIO
7
8 import networkx as nx
9 import redis as rd
10
11 # start import
12 #fi = FileImporter('data/Dataset_2012.txt')
13 #fi = FileImporter('data/test_dataset.txt')
14 #graph = fi.read()
15
16 #print "Nodes:"
17 #print graph.number_of_nodes()
18 #print "Edges:"
19 #print graph.number_of_edges()
20
21 redis = rd.StrictRedis(host='localhost', port=6379, db=0)
22 redis.flushdb()
23 all_nodes = range(0,100)
24 graph = nx.fast_gnp_random_graph(100,0.15,seed=1)
25 redis.sadd('all_nodes', *all_nodes)
26
27 mc = MetricCalculator(graph)
28
29 pr = cProfile.Profile()
30 pr.enable()
31
32 mc.start()
33
34 s = StringIO.StringIO()
35 sortby = 'cumulative'
36 ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
37 ps.print_stats()
38
39 outfile = open('profiling_run_result.txt', 'w')
40 outfile.write(s.getvalue())
File test.py added (mode: 100644) (index 0000000..853e9c1)
1 #redis test
2 import redis
3 r = redis.StrictRedis(host='localhost', port=6379, db=0)
4
5 nodes = [1,2,3,4,5,6,7,8,9]
6 for node in nodes:
7 print str(node)
8 print r.get('node:'+str(node)+':degree')
9 print r.get('node:'+str(node)+':average_neighbor_degree')
10 print r.get('node:'+str(node)+':eccentricity')
11 print r.get('node:'+str(node)+':betweenness_centrality')
12 print r.get('node:'+str(node)+':clustering_coefficient')
13 print r.get('node:'+str(node)+':average_shortest_path_length')
14
15
16
17 print type(r.get('all_nodes').strip('[]').split(', '))
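Note that test.py still reads the flat 'node:<id>:<metric>' string keys of an earlier storage scheme; with the MetricCalculator above, per-node results live in the hash 'node_metrics:<id>' and in one sorted set per metric or score. A small sketch of reading the results back under that scheme, assuming redis on localhost:6379, db 0, and node id 1 purely as an example:

import redis
r = redis.StrictRedis(host='localhost', port=6379, db=0)

# all metrics, corrections, normalized values and scores stored for node 1
print(r.hgetall('node_metrics:1'))

# the ten nodes with the highest unified risk score (ascending order)
print(r.zrange('unified_risk_score', -10, -1, withscores=True))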