List of commits:
Subject | Hash | Author | Date (UTC)
a lot of refactoring for more modular structure | dcf7bd73ccc2f871ab8d48c43d11a8e5b392b6de | mcehlert | 2014-01-09 15:50:53
initial commit - pre colloquim state | 655c77556f9d8e40b52893887cdb0d90f726fdbf | Mathias Ehlert | 2013-11-22 13:47:29
Initial commit | f53ec7a3f25d55c53aa12c2682b216e16570cdc7 | Mathias Ehlert | 2013-11-22 13:37:47
Commit dcf7bd73ccc2f871ab8d48c43d11a8e5b392b6de - a lot of refactoring for more modular structure
Author: mcehlert
Author date (UTC): 2014-01-09 15:50
Committer name: mcehlert
Committer date (UTC): 2014-01-09 15:50
Parent(s): 655c77556f9d8e40b52893887cdb0d90f726fdbf
Signer:
Signing key:
Signing status: N
Tree: 8c62df349128ce021e0230ab8e76dc0fdc363c98
File Lines added Lines deleted
file_importer.py 9 11
file_importer.pyc 0 0
metric_calculator.py 483 319
metric_calculator.pyc 0 0
profiling.py 0 43
run.py 0 40
test.py 0 17
File file_importer.py changed (mode: 100644) (index 9796f9f..3600618)
1 1 import networkx as nx import networkx as nx
2 import redis as rd
3 2
4 3 class FileImporter(object): class FileImporter(object):
4
5 5 def __init__(self,filename): def __init__(self,filename):
6 # initialize data file to parse and new empty graph
7
6 8 self.data_file = open(filename) self.data_file = open(filename)
7 self.all_nodes = []
8 self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
9 9 self.graph = nx.Graph() self.graph = nx.Graph()
10 10
11 11 def read(self): def read(self):
12 self.redis.flushdb()
13 12 for line in self.data_file: for line in self.data_file:
14 13 self.parse_line(line) self.parse_line(line)
15 self.save_all_nodes()
16 14 return self.graph return self.graph
17 15
18 16 def parse_line(self, line): def parse_line(self, line):
17 # split each line on tabstop
18 # first field specifies the source node
19 # second field specifies the target node
20
19 21 fields = line.strip().split("\t") fields = line.strip().split("\t")
20 22 from_node = int(fields[0]) from_node = int(fields[0])
21 23 to_node = int(fields[1]) to_node = int(fields[1])
22 self.all_nodes.extend([from_node,to_node])
23 self.graph.add_edge(from_node, to_node)
24 24
25 def save_all_nodes(self):
26 self.unique_nodes = list(set(self.all_nodes))
27 self.unique_nodes.sort()
28 self.redis.sadd('all_nodes', *self.unique_nodes)
25 # add edge to the graph
26 self.graph.add_edge(from_node, to_node)
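For reference, the importer still expects a plain tab-separated edge list (one integer source node and one integer target node per line); this commit only moves the Redis bookkeeping out of the class. A minimal usage sketch, with a hypothetical input file name, looks like this:

# edges.tsv (hypothetical) -- one edge per line, two tab-separated integer node ids, e.g.
#   1<TAB>2
#   1<TAB>3
from file_importer import FileImporter

importer = FileImporter('edges.tsv')   # hypothetical file name
graph = importer.read()                # returns the populated networkx.Graph
print graph.number_of_nodes(), graph.number_of_edges()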
File file_importer.pyc deleted (index 9e675cc..0000000)
File metric_calculator.py changed (mode: 100644) (index a573c4e..281cf20)
1 1 import networkx as nx import networkx as nx
2 2 import redis as rd import redis as rd
3 3 import numpy as np import numpy as np
4 import indexing
5 import statistics
6 import normalizations
7 import config
4 8
5 9
6 10 class MetricCalculator(object): class MetricCalculator(object):
 
... ... class MetricCalculator(object):
8 12 self.graph = graph self.graph = graph
9 13 self.redis = rd.StrictRedis(host='localhost', port=6379, db=0) self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
10 14 self.nodes = nx.nodes(graph) self.nodes = nx.nodes(graph)
11
12 self.node_neighbors_prefix = 'node_neighbors:'
13 self.node_prefix = 'node_metrics:'
14 self.normalization_suffix = '_normalized'
15
16 # definition of all base metrics for which absolute values will be calculated for each node in the first step
17 # key is the name of the metric and value is the implemented method which exposes the required interface
18 # interface: each method takes the node as the single parameter, performs the necessary calculation and
19 # returns a float containing the value for the specified node
20
21 self.metrics = { 'clustering_coefficient' : self.clustering_coefficient,
22 'degree' : self.degree,
23 'average_neighbor_degree' : self.average_neighbor_degree,
24 'iterated_average_neighbor_degree': self.iterated_average_neighbor_degree,
25 'betweenness_centrality' : self.betweenness_centrality,
26 'eccentricity' : self.eccentricity,
27 'average_shortest_path_length' : self.average_shortest_path_length
28 }
29
30
31 # for the frontend
32 # self.metric_names = {
33 # 'clustering_coefficient' : 'Clustering Coefficient',
34 # 'degree' : 'Node Degree',
35 # 'average_neighbor_degree' : 'Average Neighbor Node Degree',
36 # 'iterated_average_neighbor_degree': 'Iterated Average Neighbor Node Degree',
37 # 'betweenness_centrality' : 'Betweenness Centrality',
38 # 'eccentricity' : 'Node Eccentricity',
39 # 'average_shortest_path_length' : 'Average Shortest Path Length'
40 # }
41
42
43 # some metrics might require some corrections or post processing which relies on the value of other metrics or normalizations
44 # key is the metric name and value the method for correction
45
46
47 self.corrections = {'corrected_clustering_coefficient' : self.correct_clustering_coefficient,
48 'corrected_average_neighbor_degree' : self.correct_average_neighbor_degree,
49 'corrected_iterated_average_neighbor_degree': self.correct_iterated_average_neighbor_degree}
50
51
52
53 # for every metric, a normalization method has to be specified
54 # key is the name of the metric and value is the normalization method which also has to expose the required interface
55 # interface: normalization methods, take the name of the (absolute) metric as the single argument, no return value is required
56 # the method itself shall access the data which is required for normalization from the redis instance
57 # and the corresponding keys/values for the specified metric
58 # it shall then loop over all nodes and calculate the normalized value for the node and the metric
59 # afterwards it should save the result to redis using "metric_name_normalized" as the key
60 # the result is stored inside the node's hash for metrics
61
62 # also needs to include corrected metrics with their respective names
63 #
64 self.normalization_methods = { 'clustering_coefficient' : self.min_max_normalization,
65 'corrected_clustering_coefficient' : self.min_max_normalization,
66 'degree' : self.min_max_normalization,
67 'average_neighbor_degree' : self.min_max_normalization,
68 'corrected_average_neighbor_degree' : self.min_max_normalization,
69 'iterated_average_neighbor_degree' : self.min_max_normalization,
70 'corrected_iterated_average_neighbor_degree': self.min_max_normalization,
71 'betweenness_centrality' : self.min_max_normalization,
72 'eccentricity' : self.inverse_min_max_normalization,
73 'average_shortest_path_length' : self.inverse_min_max_normalization
74 }
75
76
77 # the easiest case for a score is a combination of normalized metric values with a weight which adds up to 1
78 # such scores can easily be defined here
79
80 #self.scores = ['unified_risk_score']
81
82 self.scores = {'unified_risk_score': { #'corrected_clustering_coefficient': 0.2,
83 'degree_normalized': 0.25,
84 'corrected_average_neighbor_degree_normalized': 0.15,
85 'corrected_iterated_average_neighbor_degree_normalized': 0.1,
86 'betweenness_centrality_normalized': 0.25,
87 'eccentricity_normalized': 0.125,
88 'average_shortest_path_length_normalized': 0.125}
89 }
90 15
16 self.node_index_key = config.node_index_key
17 self.metric_index_key = config.metric_index_key
18 self.score_index_key = config.score_index_key
19
20 self.node_neighbors_prefix = config.node_neighbors_prefix
21 self.node_prefix = config.node_prefix
22 self.metric_prefix = config.metric_prefix
23 self.score_prefix = config.score_prefix
24 self.statistics_prefix = config.statistics_prefix
25
26 self.normalization_suffix = config.normalization_suffix
27
28 self.base_metrics = config.base_metrics
29 self.advanced_metrics = config.advanced_metrics
30
31 self.normalization_methods = config.normalization_methods
32
33 self.scores = config.scores
34 self.advanced_scores = config.advanced_scores
35
36
37
38 # self.node_index_key = 'all_nodes'
39 # self.metric_index_key = 'all_metrics'
40 # self.score_index_key = 'all_scores'
41 #
42 # self.node_neighbors_prefix = 'node_neighbors:'
43 # self.node_prefix = 'node_metrics:'
44 # self.metric_prefix = 'metric:'
45 # self.statistics_prefix = 'statistics:'
46 #
47 # self.normalization_suffix = '_normalized'
48 #
49 # # definition of all base metrics for which absolute values will be calculated for each node in the first step
50 # # key is the name of the metric and value is the implemented method which exposes the required interface
51 # # interface: each method takes the node as the single parameter, performs the necessary calculation and
52 # # returns a float containing the value for the specified node
53 #
54 # self.metrics = { 'clustering_coefficient' : self.clustering_coefficient,
55 # 'degree' : self.degree,
56 # 'average_neighbor_degree' : self.average_neighbor_degree,
57 # 'iterated_average_neighbor_degree': self.iterated_average_neighbor_degree,
58 # 'betweenness_centrality' : self.betweenness_centrality,
59 # 'eccentricity' : self.eccentricity,
60 # 'average_shortest_path_length' : self.average_shortest_path_length
61 # }
62 #
63 #
64 # # some metrics might require some corrections or post processing which relies on the value of other metrics or normalizations
65 # # key is the metric name and value the method for correction
66 #
67 #
68 # self.advanced_metrics = { 'corrected_clustering_coefficient' : self.correct_clustering_coefficient,
69 # 'corrected_average_neighbor_degree' : self.correct_average_neighbor_degree,
70 # 'corrected_iterated_average_neighbor_degree': self.correct_iterated_average_neighbor_degree}
71 #
72 #
73 #
74 # # for every metric, a normalization method has to be specified
75 # # key is the name of the metric and value is the normalization method which also has to expose the required interface
76 # # interface: normalization methods, take the name of the (absolute) metric as the single argument, no return value is required
77 # # the method itself shall access the data which is required for normalization from the redis instance
78 # # and the corresponding keys/values for the specified metric
79 # # it shall then loop over all nodes and calculate the normalized value for the node and the metric
80 # # afterwards it should save the result to redis using "metric_name_normalized" as the key
81 # # the result is stored inside the node's hash for metrics
82 #
83 # # also needs to include corrected metrics with their respective names
84 # #
85 # self.normalization_methods = { 'clustering_coefficient' : self.min_max_normalization,
86 # 'corrected_clustering_coefficient' : self.min_max_normalization,
87 # 'degree' : self.min_max_normalization,
88 # 'average_neighbor_degree' : self.min_max_normalization,
89 # 'corrected_average_neighbor_degree' : self.min_max_normalization,
90 # 'iterated_average_neighbor_degree' : self.min_max_normalization,
91 # 'corrected_iterated_average_neighbor_degree': self.min_max_normalization,
92 # 'betweenness_centrality' : self.min_max_normalization,
93 # 'eccentricity' : self.inverse_min_max_normalization,
94 # 'average_shortest_path_length' : self.inverse_min_max_normalization
95 # }
96 #
97 #
98 # # the easiest case for a score is a combination of normalized metric values with a weight which adds up to 1
99 # # such scores can easily be defined here
100 # # note: names are not methods but redis keys
101 #
102 # self.scores = {'unified_risk_score': { #'corrected_clustering_coefficient': 0.2,
103 # 'degree_normalized': 0.25,
104 # 'corrected_average_neighbor_degree_normalized': 0.15,
105 # 'corrected_iterated_average_neighbor_degree_normalized': 0.1,
106 # 'betweenness_centrality_normalized': 0.25,
107 # 'eccentricity_normalized': 0.125,
108 # 'average_shortest_path_length_normalized': 0.125}
109 # }
110 #
111 #
112 # # other scores might require a more sophisticated algorithm to be calculated
113 # # such scores need to be added here and implemented like the example below
114 #
115 # self.advanced_scores = {'advanced_unified_risk_score': self.urs_clustering_coefficient_modification}
91 116
92 # other scores might require a more sophisticated algorithm to be calculated
93 # such scores need to be added here and implemented like the example below
94 117
95 self.advanced_scores = {'advanced_unified_risk_score': self.urs_clustering_coefficient_modification}
96 118
97 119
98 120
99 121 def start(self): def start(self):
122 #clean all data in Redis
123 self.redis.flushdb()
124
125 #index creation
126 #self.index_nodes()
127 #self.index_neighbors()
128 #self.index_metrics()
129 #self.index_scores()
130
131 self.create_indexes()
100 132
101 self.store_neighbors()
133
134 #main calculations
102 135 self.calculate_metrics() self.calculate_metrics()
103 self.calculate_corrections()
136 self.calculate_advanced_metrics()
104 137 self.normalize_metrics() self.normalize_metrics()
105 138 self.calculate_scores() self.calculate_scores()
106 139 self.calculate_advanced_scores() self.calculate_advanced_scores()
107 140
108
109
110 # write list of neighbors of each node to redis for navigation purposes in frontend
111 def store_neighbors(self):
112 for node in self.nodes:
113 node_neighbors = self.graph.neighbors(int(node))
114 self.redis.sadd(self.node_neighbors_prefix+str(node), *node_neighbors)
115
141 #statistics
142 self.calculate_statistics()
143
144 ##################
145 #### INDEXING ####
146 ##################
147 def create_indexes(self):
148 indexing.index_nodes(self)
149 indexing.index_neighbors(self)
150 indexing.index_metrics(self)
151 indexing.index_scores(self)
152
153
154 # def index_nodes(self):
155 # self.redis.sadd(self.node_index_key, *self.nodes)
156 #
157 # def index_neighbors(self):
158 # for node in self.nodes:
159 # node_neighbors = self.graph.neighbors(int(node))
160 # self.redis.sadd(self.node_neighbors_prefix+str(node), *node_neighbors)
161 #
162 # def index_metrics(self):
163 # for metric in self.metrics:
164 # self.redis.sadd(self.metric_index_key, metric)
165 #
166 # for advanced_metric in self.advanced_metrics:
167 # self.redis.sadd(self.metric_index_key, advanced_metric)
168 #
169 # def index_scores(self):
170 # for score in self.scores:
171 # self.redis.sadd(self.score_index_key, score)
172 #
173 # for advanced_score in self.advanced_scores:
174 # self.redis.sadd(self.score_index_key, advanced_score)
175
176 ###########################
177 #### CALCULATION LOOPS ####
178 ###########################
116 179 # loop through all defined metrics and call specified calculation method for each node # loop through all defined metrics and call specified calculation method for each node
117 180 def calculate_metrics(self): def calculate_metrics(self):
118 for metric_name in self.metrics:
119 metric_method = self.metrics[metric_name]
181 for metric_name in self.base_metrics:
182 metric_method = self.base_metrics[metric_name]
120 183
121 184 # loop through all nodes # loop through all nodes
122 185 for node in self.nodes: for node in self.nodes:
123 186
124 187 # call calculation method of supplied metric for current node # call calculation method of supplied metric for current node
125 188 node = int(node) node = int(node)
126 value = float(metric_method(node))
189 value = float(metric_method(self,node))
127 190
128 191 #store result in node values #store result in node values
129 192 self.redis.hset(self.node_prefix+str(node), metric_name, value) self.redis.hset(self.node_prefix+str(node), metric_name, value)
130 193
131 194 #also store result to metric set #also store result to metric set
132 self.redis.zadd(metric_name, value, str(node))
195 self.redis.zadd(self.metric_prefix+metric_name, value, str(node))
133 196
134 # loop through all defined corrections and call specified calculation method
135 def calculate_corrections(self):
136 for correction_name in self.corrections:
137 correction_method = self.corrections[correction_name]
197 # loop through all defined_advanced_metrics and call specified calculation method
198 def calculate_advanced_metrics(self):
199 for advanced_metric_name in self.advanced_metrics:
200 metric_method = self.advanced_metrics[advanced_metric_name]
138 201 for node in self.nodes: for node in self.nodes:
139 202 node = int(node) node = int(node)
140 value = float(correction_method(node))
203 value = float(metric_method(self,node))
141 204
142 205 #store result in node values #store result in node values
143 self.redis.hset(self.node_prefix+str(node), correction_name, value)
206 self.redis.hset(self.node_prefix+str(node), advanced_metric_name, value)
144 207
145 208 #also store result to metric set #also store result to metric set
146 self.redis.zadd(correction_name, value, str(node))
209 self.redis.zadd(self.metric_prefix+advanced_metric_name, value, str(node))
147 210
148 211
149 212 # loop through all defined normalizations and call respective normalization method # loop through all defined normalizations and call respective normalization method
150 213 # no default normalizations for metrics not listed in the "normalization_methods" hash # no default normalizations for metrics not listed in the "normalization_methods" hash
151 214 def normalize_metrics(self): def normalize_metrics(self):
152 for metric_name in self.normalization_methods:
153 normalization_method = self.normalization_methods[metric_name]
154 normalization_method(metric_name)
155
156 # normalizations
157 # min max normalization
158 def min_max_normalization(self,metric_name):
159 #perform min max normalization of specified metric for all nodes
160 #min_max normalization
161 #get min and max from redis
162 x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
163 x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
164
165 #print x_min
166 #print x_max
215 #fallback normalization: min-max
167 216
168 for node in self.nodes:
169 if x_min == x_max:
170 x_normalized = 1.0
217 all_metrics = dict(self.base_metrics.items() + self.advanced_metrics.items())
218
219 for metric_name in all_metrics:
220 if self.normalization_methods.has_key(metric_name):
221 normalization_method = self.normalization_methods[metric_name]
171 222 else: else:
172 x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
173 x_normalized = (x - x_min) / (x_max - x_min)
174
175 #store value for node and metric
176 self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
177 self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
178
179 #max min normalization
180 def inverse_min_max_normalization(self,metric_name):
181 x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
182 x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
223 #fallback normalization is min-max
224 normalization_method = normalizations.min_max
225 normalization_method(self,metric_name)
183 226
184 for node in self.nodes:
185 if x_min == x_max:
186 x_normalized = 1.0
187 else:
188 x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
189 x_normalized = (x_max - x) / (x_max - x_min)
190
191 #store value for node and metric
192 self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
193 self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
194 227
195 228
229
230 # # normalizations
231 # # min max normalization
232 # def min_max_normalization(self,metric_name):
233 # #perform min max normalization of specified metric for all nodes
234 # #min_max normalization
235 # #get min and max from redis
236 # x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
237 # x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
238 #
239 # #print x_min
240 # #print x_max
241 #
242 # for node in self.nodes:
243 # if x_min == x_max:
244 # x_normalized = 1.0
245 # else:
246 # x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
247 # x_normalized = (x - x_min) / (x_max - x_min)
248 #
249 # #store value for node and metric
250 # self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
251 # self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
252 #
253 # #max min normalization
254 # def inverse_min_max_normalization(self,metric_name):
255 # x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
256 # x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
257 #
258 # for node in self.nodes:
259 # if x_min == x_max:
260 # x_normalized = 1.0
261 # else:
262 # x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
263 # x_normalized = (x_max - x) / (x_max - x_min)
264 #
265 # #store value for node and metric
266 # self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
267 # self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
268 #
196 269 def calculate_scores(self): def calculate_scores(self):
197 270 for score_name in self.scores: for score_name in self.scores:
198 271 metrics_with_weights = self.scores[score_name] metrics_with_weights = self.scores[score_name]
 
... ... class MetricCalculator(object):
200 273 for node in self.nodes: for node in self.nodes:
201 274 score_value = 0.0 score_value = 0.0
202 275
276 # get normalized values
203 277 for metric in metrics_with_weights: for metric in metrics_with_weights:
204 278 weight = self.scores[score_name][metric] weight = self.scores[score_name][metric]
205 value = float(self.redis.hget(self.node_prefix+str(node),metric))
279 value = float(self.redis.hget(self.node_prefix+str(node),metric+self.normalization_suffix))
206 280 score_value += weight * value score_value += weight * value
207 281
208 282 self.redis.hset(self.node_prefix+str(node),score_name, score_value) self.redis.hset(self.node_prefix+str(node),score_name, score_value)
209 self.redis.zadd(score_name, score_value, str(node))
283 self.redis.zadd(self.score_prefix+score_name, score_value, str(node))
210 284
211 285 def calculate_advanced_scores(self): def calculate_advanced_scores(self):
212 286 for advanced_score in self.advanced_scores: for advanced_score in self.advanced_scores:
213 self.advanced_scores[advanced_score]()
287 self.advanced_scores[advanced_score](self)
214 288
215 289
216 290 ################################################### ###################################################
217 291 # actual metrics and corrections etc. below # actual metrics and corrections etc. below
218 292 # must return value which can be converted to float # must return value which can be converted to float
293 ###################################################
294 #
295 # def clustering_coefficient(self,node):
296 # #in the first run calculate the metric for all nodes at once and save in a hash of the instance to access later
297 # #NOTE: this should result in a performance gain, but for very large graphs this might be a problem.
298 # # in this case, just returning nx.clustering(self.graph, node) might be better
299 # if not hasattr(self, 'all_clustering_coefficients'):
300 # self.all_clustering_coefficients = nx.clustering(self.graph)
301 #
302 # #get the actual value from the pre-calculated hash
303 # return self.all_clustering_coefficients[node]
304 #
305 # def degree(self, node):
306 # return self.graph.degree(node)
307 #
308 #
309 # def average_neighbor_degree(self,node):
310 # # same caching technique as in self.clustering_coefficient
311 # # might also break for very large graphs
312 # # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go
313 #
314 # if not hasattr(self, 'all_average_neighbor_degrees'):
315 # self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
316 # return self.all_average_neighbor_degrees[node]
317 #
318 # def iterated_average_neighbor_degree(self, node):
319 #
320 # first_level_neighbors = self.graph.neighbors(node)
321 # second_level_neighbors = []
322 #
323 # # get all two-hop nodes
324 # for first_level_neighbor in first_level_neighbors:
325 # current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
326 # second_level_neighbors.extend(current_second_level_neighbors)
327 #
328 # #remove one-hop nodes and self
329 # relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
330 #
331 # degree_sum = 0
332 # for relevant_node in relevant_nodes:
333 # degree_sum += self.graph.degree(relevant_node)
334 #
335 # return float(degree_sum)/float(len(relevant_nodes))
336 #
337 # def betweenness_centrality(self, node):
338 # if not hasattr(self, 'all_betweenness_centralities'):
339 # self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
340 # return self.all_betweenness_centralities[node]
341 #
342 # def eccentricity(self, node):
343 # if not hasattr(self, 'all_eccentricities'):
344 # self.all_eccentricities = nx.eccentricity(self.graph)
345 # return self.all_eccentricities[node]
346 #
347 # def average_shortest_path_length(self, node):
348 # # caching average_shortest_path_length for all nodes at once failed
349 # # already switched to single calculation
350 #
351 # #get all shortest path lengths
352 # all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)
353 #
354 # #calculate average
355 # sum_of_lengths = 0
356 # for target in all_shortest_path_lengths_for_node:
357 # sum_of_lengths += all_shortest_path_lengths_for_node[target]
358 #
359 # return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)
360 #
361 #
362 ##############
363 ## corrections
364 ##############
365 # def correct_clustering_coefficient(self,node):
366 # clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
367 # degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
368 # corrected_cc = clustering_coefficient + (degree * clustering_coefficient) / float(4)
369 #
370 # return corrected_cc
371 #
372 # #def correct_clustering_coefficient(self):
373 #
374 # # for node in self.nodes:
375 # # clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
376 # # degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
377 #
378 # # corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
379 #
380 # # self.redis.hset(self.node_prefix+str(node), 'corrected_clustering_coefficient', corrected_cc)
381 # # self.redis.zadd('corrected_clustering_coefficient', corrected_cc, str(node))
382 #
383 # def correct_average_neighbor_degree(self,node):
384 # avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))
385 #
386 # neighbors = self.graph.neighbors(node)
387 # number_of_neighbors = float(len(neighbors))
388 # neighbor_degrees = []
389 # for neighbor in neighbors:
390 # neighbor_degrees.append(self.graph.degree(neighbor))
391 #
392 # #using numpy median and standard deviation implementation
393 # numpy_neighbor_degrees = np.array(neighbor_degrees)
394 # median = np.median(numpy_neighbor_degrees)
395 # standard_deviation = np.std(numpy_neighbor_degrees)
396 #
397 # if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
398 # return avgnd
399 # else:
400 # return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_neighbors ) * avgnd
401 #
402 #
403 # def correct_iterated_average_neighbor_degree(self, node):
404 # avgnd = float(self.redis.hget(self.node_prefix+str(node), 'iterated_average_neighbor_degree'))
405 #
406 # first_level_neighbors = self.graph.neighbors(node)
407 # second_level_neighbors = []
408 #
409 # # get all two-hop nodes
410 # for first_level_neighbor in first_level_neighbors:
411 # current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
412 # second_level_neighbors.extend(current_second_level_neighbors)
413 #
414 # #remove one-hop neighbors and self
415 # relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
416 #
417 # number_of_nodes = len(relevant_nodes)
418 # node_degrees = []
419 # for rel_node in relevant_nodes:
420 # node_degrees.append(self.graph.degree(rel_node))
421 #
422 # numpy_node_degrees = np.array(node_degrees)
423 # median = np.median(numpy_node_degrees)
424 # standard_deviation = np.std(numpy_node_degrees)
425 #
426 # if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
427 # return avgnd
428 # else:
429 # return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_nodes ) * avgnd
430 #
431 #
432 #
433 #
434 #################
435 ##advanced scores
436 #################
437 #
438 # def urs_clustering_coefficient_modification(self):
439 #
440 # #caching of values
441 # all_ccs_normalized = dict(self.redis.zrange('corrected_clustering_coefficient'+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float))
442 # all_urs = dict(self.redis.zrange('unified_risk_score', 0, -1, withscores=True, score_cast_func=float))
443 #
444 # urs_percentile_10 = np.percentile(all_urs.values(), 10)
445 # urs_percentile_90 = np.percentile(all_urs.values(), 90)
446 #
447 # for node in self.nodes:
448 # #cc_normalized = float(self.redis.hget(self.node_prefix+str(node),'corrected_clustering_coefficient'+self.normalization_suffix))
449 # #urs = float(self.redis.hget(self.node_prefix+str(node),'unified_risk_score'))
450 #
451 # cc_normalized = all_ccs_normalized[str(node)]
452 # urs = all_urs[str(node)]
453 #
454 #
455 # if (urs >= urs_percentile_90 or urs <= urs_percentile_10):
456 # if (cc_normalized >= 0.25):
457 # advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
458 # else:
459 # advanced_unified_risk_score = urs
460 # else:
461 # advanced_unified_risk_score = urs
462 #
463 # #save for node
464 # self.redis.hset(self.node_prefix+str(node), 'advanced_unified_risk_score', advanced_unified_risk_score)
465 # #save for metric
466 # self.redis.zadd('advanced_unified_risk_score', advanced_unified_risk_score, str(node))
467
468 #############
469 # statistics
470 #############
471
472 def calculate_statistics(self):
473 for metric in self.base_metrics:
474 #absolute and normalized
475 statistics.calculate_statistics(self, metric, self.metric_prefix+metric)
476 statistics.calculate_statistics(self, metric+self.normalization_suffix, self.metric_prefix+metric+self.normalization_suffix)
219 477
220 def clustering_coefficient(self,node):
221 #in the first run calculate the metric for all nodes at once and save in a hash of the instance to access later
222 #NOTE: this should result in a performance gain, but for very large graphs this might be a problem.
223 # in this case, just returning nx.clustering(self.graph, node) might be better
224 if not hasattr(self, 'all_clustering_coefficients'):
225 self.all_clustering_coefficients = nx.clustering(self.graph)
226
227 #get the actual value from the pre-calculated hash
228 return self.all_clustering_coefficients[node]
229
230 def degree(self, node):
231 return self.graph.degree(node)
232
233
234 def average_neighbor_degree(self,node):
235 # same caching technique as in self.clustering_coefficient
236 # might also break for very large graphs
237 # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go
238
239 if not hasattr(self, 'all_average_neighbor_degrees'):
240 self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
241 return self.all_average_neighbor_degrees[node]
242
243 def iterated_average_neighbor_degree(self, node):
244
245 first_level_neighbors = self.graph.neighbors(node)
246 second_level_neighbors = []
247
248 # get all two-hop nodes
249 for first_level_neighbor in first_level_neighbors:
250 current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
251 second_level_neighbors.extend(current_second_level_neighbors)
252
253 #remove one-hop nodes and self
254 relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
255
256 degree_sum = 0
257 for relevant_node in relevant_nodes:
258 degree_sum += self.graph.degree(relevant_node)
259
260 return float(degree_sum)/float(len(relevant_nodes))
261
262 def betweenness_centrality(self, node):
263 if not hasattr(self, 'all_betweenness_centralities'):
264 self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
265 return self.all_betweenness_centralities[node]
266
267 def eccentricity(self, node):
268 if not hasattr(self, 'all_eccentricities'):
269 self.all_eccentricities = nx.eccentricity(self.graph)
270 return self.all_eccentricities[node]
271
272 def average_shortest_path_length(self, node):
273 # caching average_shortest_path_length for all nodes at once failed
274 # already switched to single calculation
275
276 #get all shortest path lengths
277 all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)
278
279 #calculate average
280 sum_of_lengths = 0
281 for target in all_shortest_path_lengths_for_node:
282 sum_of_lengths += all_shortest_path_lengths_for_node[target]
283
284 return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)
285
286
287 #############
288 # corrections
289 #############
290 def correct_clustering_coefficient(self,node):
291 clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
292 degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
293 corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
294
295 return corrected_cc
296
297 #def correct_clustering_coefficient(self):
298
299 # for node in self.nodes:
300 # clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
301 # degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
302
303 # corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
304
305 # self.redis.hset(self.node_prefix+str(node), 'corrected_clustering_coefficient', corrected_cc)
306 # self.redis.zadd('corrected_clustering_coefficient', corrected_cc, str(node))
307
308 def correct_average_neighbor_degree(self,node):
309 avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))
310
311 neighbors = self.graph.neighbors(node)
312 number_of_neighbors = float(len(neighbors))
313 neighbor_degrees = []
314 for neighbor in neighbors:
315 neighbor_degrees.append(self.graph.degree(neighbor))
316
317 #using numpy median and standard deviation implementation
318 numpy_neighbor_degrees = np.array(neighbor_degrees)
319 median = np.median(numpy_neighbor_degrees)
320 standard_deviation = np.std(numpy_neighbor_degrees)
321
322 if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
323 return avgnd
324 else:
325 return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_neighbors ) * avgnd
326
327 #return 18
328
329 def correct_iterated_average_neighbor_degree(self, node):
330 avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))
331
332 first_level_neighbors = self.graph.neighbors(node)
333 second_level_neighbors = []
334
335 # get all two-hop nodes
336 for first_level_neighbor in first_level_neighbors:
337 current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
338 second_level_neighbors.extend(current_second_level_neighbors)
339
340 #remove one-hop neighbors and self
341 relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
342
343 number_of_nodes = len(relevant_nodes)
344 node_degrees = []
345 for rel_node in relevant_nodes:
346 node_degrees.append(self.graph.degree(rel_node))
347
348 numpy_node_degrees = np.array(node_degrees)
349 median = np.median(numpy_node_degrees)
350 standard_deviation = np.std(numpy_node_degrees)
351
352 if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
353 return avgnd
354 else:
355 return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_nodes ) * avgnd
356
357
358
359
360 ################
361 #advanced scores
362 ################
363
364 def urs_clustering_coefficient_modification(self):
365
366 #caching of values
367 all_ccs_normalized = dict(self.redis.zrange('corrected_clustering_coefficient'+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float))
368 all_urs = dict(self.redis.zrange('unified_risk_score', 0, -1, withscores=True, score_cast_func=float))
369
370 urs_percentile_10 = np.percentile(all_urs.values(), 10)
371 urs_percentile_90 = np.percentile(all_urs.values(), 90)
372
373 for node in self.nodes:
374 #cc_normalized = float(self.redis.hget(self.node_prefix+str(node),'corrected_clustering_coefficient'+self.normalization_suffix))
375 #urs = float(self.redis.hget(self.node_prefix+str(node),'unified_risk_score'))
376
377 cc_normalized = all_ccs_normalized[str(node)]
378 urs = all_urs[str(node)]
379
380
381
382 if (urs >= urs_percentile_90 or urs <= urs_percentile_10):
383 if (cc_normalized >= 0.25):
384 advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
385 else:
386 advanced_unified_risk_score = urs
387 else:
388 advanced_unified_risk_score = urs
478 for advanced_metric in self.advanced_metrics:
479 #absolute and normalized
480 statistics.calculate_statistics(self, advanced_metric, self.metric_prefix+advanced_metric)
481 statistics.calculate_statistics(self, advanced_metric+self.normalization_suffix, self.metric_prefix+advanced_metric+self.normalization_suffix)
389 482
390 #save for node
391 self.redis.hset(self.node_prefix+str(node), 'advanced_unified_risk_score', advanced_unified_risk_score)
392 #save for metric
393 self.redis.zadd('advanced_unified_risk_score', advanced_unified_risk_score, str(node))
483 for score in self.scores:
484 statistics.calculate_statistics(self, score, self.score_prefix+score)
394 485
395
486 for advanced_score in self.advanced_scores:
487 statistics.calculate_statistics(self, advanced_score, self.score_prefix+advanced_score)
488
489 statistics.calculate_correlations(self)
490
491 #
492 #
493 # def calculate_statistics_for_absolute_values(self,metric):
494 # all_values = dict(self.redis.zrange(metric, 0, -1, withscores=True, score_cast_func=float)).values()
495 # min_value = np.min(np.array(all_values))
496 # max_value = np.max(all_values)
497 #
498 # average = np.average(all_values)
499 # median = np.median(all_values)
500 # standard_deviation = np.std(all_values)
501 #
502 # self.redis.hset(self.statistics_prefix+str(metric), 'min', min_value)
503 # self.redis.hset(self.statistics_prefix+str(metric), 'max', max_value)
504 # self.redis.hset(self.statistics_prefix+str(metric), 'average', average)
505 # self.redis.hset(self.statistics_prefix+str(metric), 'median', median)
506 # self.redis.hset(self.statistics_prefix+str(metric), 'standard_deviation', standard_deviation)
507 #
508 # def calculate_statistics_for_normalized_values(self,metric):
509 # all_values = dict(self.redis.zrange(metric+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float)).values()
510 #
511 # min_value = np.min(all_values)
512 # max_value = np.max(all_values)
513 #
514 # average = np.average(all_values)
515 # median = np.median(all_values)
516 # standard_deviation = np.std(all_values)
517 #
518 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'min', min_value)
519 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'max', max_value)
520 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'average', average)
521 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'median', median)
522 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'standard_deviation', standard_deviation)
523 #
524 #
525 # def calculate_correlations(self):
526 # m = self.metrics.keys()
527 # c = self.corrections.keys()
528 #
529 # metrics = m + c
530 #
531 # correlations = {}
532 # for metric1 in metrics:
533 # correlations[metric1] = {}
534 # for metric2 in metrics:
535 # correlations[metric1][metric2] = (0,0)
536 # if metric1 == metric2:
537 # correlations[metric1][metric2] = (1,0)
538 # continue
539 #
540 # dict_metric1 = dict(self.redis.zrange(metric1, 0, -1, withscores=True, score_cast_func=float))
541 # dict_metric2 = dict(self.redis.zrange(metric2, 0, -1, withscores=True, score_cast_func=float))
542 # values_metric1 = []
543 # values_metric2 = []
544 #
545 # for key in sorted(dict_metric1.iterkeys()):
546 # values_metric1.append(dict_metric1[key])
547 #
548 # for key in sorted(dict_metric2.iterkeys()):
549 # values_metric2.append(dict_metric2[key])
550 #
551 # correlations[metric1][metric2] = pearsonr(values_metric1,values_metric2)
552 #
553 # values_metric1 = []
554 # values_metric2 = []
555 #
556 # for source in correlations:
557 # for target in correlations[source]:
558 # self.redis.hset("correlations:"+source+":"+target, "correlation", correlations[source][target][0])
559 # self.redis.hset("correlations:"+source+":"+target, "confidence", correlations[source][target][1])
File metric_calculator.pyc deleted (index ff74136..0000000)
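The index helpers only survive as the commented-out in-class versions above, so this is a sketch of how the new indexing module presumably exposes them as module-level functions (with self.metrics renamed to base_metrics, matching the indexing.index_*(self) calls in create_indexes()):

# indexing.py -- sketch, lifted from the commented-out in-class index helpers
def index_nodes(calculator):
    calculator.redis.sadd(calculator.node_index_key, *calculator.nodes)

def index_neighbors(calculator):
    for node in calculator.nodes:
        node_neighbors = calculator.graph.neighbors(int(node))
        calculator.redis.sadd(calculator.node_neighbors_prefix + str(node), *node_neighbors)

def index_metrics(calculator):
    for metric in calculator.base_metrics:
        calculator.redis.sadd(calculator.metric_index_key, metric)
    for advanced_metric in calculator.advanced_metrics:
        calculator.redis.sadd(calculator.metric_index_key, advanced_metric)

def index_scores(calculator):
    for score in calculator.scores:
        calculator.redis.sadd(calculator.score_index_key, score)
    for advanced_score in calculator.advanced_scores:
        calculator.redis.sadd(calculator.score_index_key, advanced_score)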
File profiling.py deleted (index 798f46e..0000000)
1 from metric_calculator import MetricCalculator
2 import networkx as nx
3 import redis as rd
4
5 import cProfile, pstats, StringIO
6
7 redis = rd.StrictRedis(host='localhost', port=6379, db=0)
8
9 #random_runs = [[100,0.2],[100,0.3]]
10 random_runs = [[1000,0.05],[1000,0.1],[1000,0.2],[10000,0.3],[1000,0.4],[2000,0.2],[3000,0.2],[4000,0.2],[5000,0.2],[6000,0.2]]
11
12
13 for graph_configuration in random_runs:
14
15 number_of_nodes = graph_configuration[0]
16 probability_of_connection = graph_configuration[1]
17
18 graph = nx.fast_gnp_random_graph(number_of_nodes,probability_of_connection,seed=1)
19
20 nodes = nx.nodes(graph)
21 #barabasi_albert_graph(n, m, seed=None)[source]
22
23 if not nx.is_connected(graph):
24 print "not connected"
25 sys.exit(-1)
26
27 redis.flushdb()
28 redis.sadd('all_nodes', *nodes)
29
30 mc = MetricCalculator(graph)
31
32 pr = cProfile.Profile()
33 pr.enable()
34
35 mc.start()
36
37 s = StringIO.StringIO()
38 sortby = 'cumulative'
39 ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
40 ps.print_stats()
41
42 outfile = open('auto_profiling_output_'+str(number_of_nodes)+'_'+str(probability_of_connection)+'.txt', 'w')
43 outfile.write(s.getvalue())
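Likewise, normalizations.min_max is not shown in this commit; the sketch below adapts the removed min_max_normalization method to the new module interface. Reading and writing the sorted sets under metric_prefix is an assumption based on how calculate_metrics() and calculate_statistics() now address them:

# normalizations.py -- sketch of min_max, adapted from the removed
# MetricCalculator.min_max_normalization; the metric_prefix key handling is an assumption
def min_max(calculator, metric_name):
    redis_key = calculator.metric_prefix + metric_name

    # smallest and largest absolute value of the metric
    x_min = calculator.redis.zrange(redis_key, 0, 0, withscores=True, score_cast_func=float)[0][1]
    x_max = calculator.redis.zrange(redis_key, -1, -1, withscores=True, score_cast_func=float)[0][1]

    for node in calculator.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(calculator.redis.hget(calculator.node_prefix + str(node), metric_name))
            x_normalized = (x - x_min) / (x_max - x_min)

        # store the normalized value in the metric's sorted set and in the node's hash
        calculator.redis.zadd(redis_key + calculator.normalization_suffix, x_normalized, str(node))
        calculator.redis.hset(calculator.node_prefix + str(node),
                              metric_name + calculator.normalization_suffix, x_normalized)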
File run.py deleted (index 682220f..0000000)
1 #!/usr/bin/env python
2
3 from file_importer import FileImporter
4 from metric_calculator import MetricCalculator
5
6 import cProfile, pstats, StringIO
7
8 import networkx as nx
9 import redis as rd
10
11 # start import
12 #fi = FileImporter('data/Dataset_2012.txt')
13 #fi = FileImporter('data/test_dataset.txt')
14 #graph = fi.read()
15
16 #print "Nodes:"
17 #print graph.number_of_nodes()
18 #print "Edges:"
19 #print graph.number_of_edges()
20
21 redis = rd.StrictRedis(host='localhost', port=6379, db=0)
22 redis.flushdb()
23 all_nodes = range(1,100)
24 graph = nx.fast_gnp_random_graph(100,0.15,seed=1)
25 redis.sadd('all_nodes', *all_nodes)
26
27 mc = MetricCalculator(graph)
28
29 pr = cProfile.Profile()
30 pr.enable()
31
32 mc.start()
33
34 s = StringIO.StringIO()
35 sortby = 'cumulative'
36 ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
37 ps.print_stats()
38
39 outfile = open('profiling_run_result.txt', 'w')
40 outfile.write(s.getvalue())
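run.py is removed here and its replacement is not part of the diff. Under the new layout a minimal driver would presumably need only the two lines below, since start() now flushes Redis and builds the node index itself instead of relying on the caller (the data file name is taken from the old comments and is just an example):

#!/usr/bin/env python
# sketch of a minimal driver for the refactored modules; the real entry point
# replacing run.py is not part of this commit
from file_importer import FileImporter
from metric_calculator import MetricCalculator

graph = FileImporter('data/test_dataset.txt').read()  # example data file
MetricCalculator(graph).start()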
File test.py deleted (index 853e9c1..0000000)
1 #redis test
2 import redis
3 r = redis.StrictRedis(host='localhost', port=6379, db=0)
4
5 nodes = [1,2,3,4,5,6,7,8,9]
6 for node in nodes:
7 print str(node)
8 print r.get('node:'+str(node)+':degree')
9 print r.get('node:'+str(node)+':average_neighbor_degree')
10 print r.get('node:'+str(node)+':eccentricity')
11 print r.get('node:'+str(node)+':betweenness_centrality')
12 print r.get('node:'+str(node)+':clustering_coefficient')
13 print r.get('node:'+str(node)+':average_shortest_path_length')
14
15
16
17 print r.get('all_nodes').strip('[]').split(', ').type()
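Finally, the new statistics module is also absent from the diff. Judging from the calculate_statistics(self, metric, redis_key) calls and the two commented-out in-class variants, a sketch could look like this; calculate_correlations(calculator) would additionally compute the pairwise Pearson correlations shown in the commented block:

# statistics.py -- sketch of calculate_statistics, merged from the two commented-out
# in-class variants; the (calculator, metric_name, redis_key) signature is an assumption
# mirroring the calls in MetricCalculator.calculate_statistics()
import numpy as np

def calculate_statistics(calculator, metric_name, redis_key):
    # all stored values for this metric (absolute or normalized, depending on redis_key)
    all_values = dict(calculator.redis.zrange(redis_key, 0, -1,
                                              withscores=True, score_cast_func=float)).values()

    stats_key = calculator.statistics_prefix + str(metric_name)
    calculator.redis.hset(stats_key, 'min', np.min(all_values))
    calculator.redis.hset(stats_key, 'max', np.max(all_values))
    calculator.redis.hset(stats_key, 'average', np.average(all_values))
    calculator.redis.hset(stats_key, 'median', np.median(all_values))
    calculator.redis.hset(stats_key, 'standard_deviation', np.std(all_values))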