File metric_calculator.py added (mode: 100644) (index 0000000..a573c4e)
import networkx as nx
import redis as rd
import numpy as np


class MetricCalculator(object):
    def __init__(self, graph):
        self.graph = graph
        self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
        self.nodes = nx.nodes(graph)

        self.node_neighbors_prefix = 'node_neighbors:'
        self.node_prefix = 'node_metrics:'
        self.normalization_suffix = '_normalized'

        # definition of all base metrics for which absolute values will be calculated for each node in the first step
        # key is the name of the metric and value is the implemented method which exposes the required interface
        # interface: each method takes the node as the single parameter, performs the necessary calculation
        # and returns a float containing the value for the specified node

        self.metrics = {'clustering_coefficient':              self.clustering_coefficient,
                        'degree':                               self.degree,
                        'average_neighbor_degree':              self.average_neighbor_degree,
                        'iterated_average_neighbor_degree':     self.iterated_average_neighbor_degree,
                        'betweenness_centrality':               self.betweenness_centrality,
                        'eccentricity':                         self.eccentricity,
                        'average_shortest_path_length':         self.average_shortest_path_length}

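        # as a sketch of this interface (purely illustrative, not part of the calculator):
        # an additional metric such as closeness centrality could be registered as
        #     'closeness_centrality': self.closeness_centrality
        # and backed by a method like
        #     def closeness_centrality(self, node):
        #         return nx.closeness_centrality(self.graph, node)
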
        # for the frontend
        # self.metric_names = {'clustering_coefficient':              'Clustering Coefficient',
        #                      'degree':                              'Node Degree',
        #                      'average_neighbor_degree':             'Average Neighbor Node Degree',
        #                      'iterated_average_neighbor_degree':    'Iterated Average Neighbor Node Degree',
        #                      'betweenness_centrality':              'Betweenness Centrality',
        #                      'eccentricity':                        'Node Eccentricity',
        #                      'average_shortest_path_length':        'Average Shortest Path Length'}

        # some metrics might require corrections or post-processing which rely on the values of other metrics or on normalizations
        # key is the metric name and value is the method for the correction

        self.corrections = {'corrected_clustering_coefficient':               self.correct_clustering_coefficient,
                            'corrected_average_neighbor_degree':               self.correct_average_neighbor_degree,
                            'corrected_iterated_average_neighbor_degree':      self.correct_iterated_average_neighbor_degree}

        # for every metric, a normalization method has to be specified
        # key is the name of the metric and value is the normalization method, which also has to expose the required interface
        # interface: normalization methods take the name of the (absolute) metric as the single argument; no return value is required
        # the method itself shall read the data required for the normalization from the redis instance,
        # using the corresponding keys/values for the specified metric
        # it shall then loop over all nodes and calculate the normalized value for each node and the metric
        # afterwards it should save the result to redis using "metric_name_normalized" as the key
        # the result is also stored inside the node's hash for metrics

        # corrected metrics also need to be included here under their respective names
        self.normalization_methods = {'clustering_coefficient':                        self.min_max_normalization,
                                      'corrected_clustering_coefficient':              self.min_max_normalization,
                                      'degree':                                        self.min_max_normalization,
                                      'average_neighbor_degree':                       self.min_max_normalization,
                                      'corrected_average_neighbor_degree':             self.min_max_normalization,
                                      'iterated_average_neighbor_degree':              self.min_max_normalization,
                                      'corrected_iterated_average_neighbor_degree':    self.min_max_normalization,
                                      'betweenness_centrality':                        self.min_max_normalization,
                                      'eccentricity':                                  self.inverse_min_max_normalization,
                                      'average_shortest_path_length':                  self.inverse_min_max_normalization}

        # the easiest case for a score is a combination of normalized metric values with weights which add up to 1
        # such scores can easily be defined here

        # self.scores = ['unified_risk_score']

        self.scores = {'unified_risk_score': {  #'corrected_clustering_coefficient': 0.2,
                           'degree_normalized':                                      0.25,
                           'corrected_average_neighbor_degree_normalized':           0.15,
                           'corrected_iterated_average_neighbor_degree_normalized':  0.1,
                           'betweenness_centrality_normalized':                      0.25,
                           'eccentricity_normalized':                                0.125,
                           'average_shortest_path_length_normalized':                0.125}
                       }

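        # as a sketch of how such a definition is applied: for a single node the unified risk score is
        #     0.25*degree + 0.15*avg_neighbor_degree + 0.1*iterated_avg_neighbor_degree
        #     + 0.25*betweenness_centrality + 0.125*eccentricity + 0.125*avg_shortest_path_length
        # with every value taken from its *_normalized counterpart; the weights sum to 1.0, so the
        # score stays within [0, 1] as long as every normalized input does
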
        # other scores might require a more sophisticated algorithm to be calculated
        # such scores need to be added here and implemented like the example below

        self.advanced_scores = {'advanced_unified_risk_score': self.urs_clustering_coefficient_modification}

    def start(self):
        self.store_neighbors()
        self.calculate_metrics()
        self.calculate_corrections()
        self.normalize_metrics()
        self.calculate_scores()
        self.calculate_advanced_scores()

    # write the list of neighbors of each node to redis for navigation purposes in the frontend
    def store_neighbors(self):
        for node in self.nodes:
            node_neighbors = self.graph.neighbors(int(node))
            self.redis.sadd(self.node_neighbors_prefix+str(node), *node_neighbors)

    # loop through all defined metrics and call the specified calculation method for each node
    def calculate_metrics(self):
        for metric_name in self.metrics:
            metric_method = self.metrics[metric_name]

            # loop through all nodes
            for node in self.nodes:
                # call the calculation method of the supplied metric for the current node
                node = int(node)
                value = float(metric_method(node))

                # store the result in the node's hash of values
                self.redis.hset(self.node_prefix+str(node), metric_name, value)

                # also store the result in the metric's sorted set
                self.redis.zadd(metric_name, value, str(node))

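    # note on the redis layout used above and in the methods below: every node gets a hash
    # 'node_metrics:<id>' holding all of its values, and every metric/score gets a sorted set
    # keyed by its name and ordered by value; the zadd calls pass the score before the member,
    # which assumes the redis-py 2.x calling convention -- redis-py 3.x+ would instead expect
    # a mapping, e.g. self.redis.zadd(metric_name, {str(node): value})
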
    # loop through all defined corrections and call the specified calculation method
    def calculate_corrections(self):
        for correction_name in self.corrections:
            correction_method = self.corrections[correction_name]
            for node in self.nodes:
                node = int(node)
                value = float(correction_method(node))

                # store the result in the node's hash of values
                self.redis.hset(self.node_prefix+str(node), correction_name, value)

                # also store the result in the correction's sorted set
                self.redis.zadd(correction_name, value, str(node))

    # loop through all defined normalizations and call the respective normalization method
    # there is no default normalization for metrics not listed in the "normalization_methods" hash
    def normalize_metrics(self):
        for metric_name in self.normalization_methods:
            normalization_method = self.normalization_methods[metric_name]
            normalization_method(metric_name)

    # normalizations

    # min-max normalization: normalize the specified metric for all nodes
    def min_max_normalization(self, metric_name):
        # get min and max from the metric's sorted set in redis
        x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
        x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

        for node in self.nodes:
            if x_min == x_max:
                x_normalized = 1.0
            else:
                x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
                x_normalized = (x - x_min) / (x_max - x_min)

            # store the value for the node and the metric
            self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
            self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)

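    # worked example for the normalization above: with x_min = 2.0 and x_max = 10.0 a raw
    # value x = 8.0 maps to (8.0 - 2.0) / (10.0 - 2.0) = 0.75; the smallest raw value always
    # maps to 0.0 and the largest to 1.0
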
    # inverse min-max normalization: high raw values map to low normalized values
    def inverse_min_max_normalization(self, metric_name):
        x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
        x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

        for node in self.nodes:
            if x_min == x_max:
                x_normalized = 1.0
            else:
                x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
                x_normalized = (x_max - x) / (x_max - x_min)

            # store the value for the node and the metric
            self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
            self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)

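    # worked example for the inverse variant: with x_min = 2.0 and x_max = 10.0 a raw value
    # x = 8.0 maps to (10.0 - 8.0) / (10.0 - 2.0) = 0.25, so the largest raw value maps to 0.0
    # and the smallest to 1.0
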
    # calculate all scores that are defined as weighted combinations of normalized metrics
    def calculate_scores(self):
        for score_name in self.scores:
            metrics_with_weights = self.scores[score_name]

            for node in self.nodes:
                score_value = 0.0

                for metric in metrics_with_weights:
                    weight = self.scores[score_name][metric]
                    value = float(self.redis.hget(self.node_prefix+str(node), metric))
                    score_value += weight * value

                self.redis.hset(self.node_prefix+str(node), score_name, score_value)
                self.redis.zadd(score_name, score_value, str(node))

    # calculate all scores that are implemented by a dedicated method
    def calculate_advanced_scores(self):
        for advanced_score in self.advanced_scores:
            self.advanced_scores[advanced_score]()

    ###################################################
    # actual metrics, corrections etc. below
    # each must return a value which can be converted to float
    ###################################################

    def clustering_coefficient(self, node):
        # on the first call, calculate the metric for all nodes at once and cache the result
        # in an attribute of the instance for later access
        # NOTE: this should result in a performance gain, but for very large graphs it might be a problem;
        # in that case, just returning nx.clustering(self.graph, node) might be better
        if not hasattr(self, 'all_clustering_coefficients'):
            self.all_clustering_coefficients = nx.clustering(self.graph)

        # get the actual value from the pre-calculated hash
        return self.all_clustering_coefficients[node]

    def degree(self, node):
        return self.graph.degree(node)

    def average_neighbor_degree(self, node):
        # same caching technique as in self.clustering_coefficient
        # might also break for very large graphs
        # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go

        if not hasattr(self, 'all_average_neighbor_degrees'):
            self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
        return self.all_average_neighbor_degrees[node]

    def iterated_average_neighbor_degree(self, node):
        first_level_neighbors = self.graph.neighbors(node)
        second_level_neighbors = []

        # get all two-hop nodes
        for first_level_neighbor in first_level_neighbors:
            current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
            second_level_neighbors.extend(current_second_level_neighbors)

        # remove one-hop nodes and the node itself
        relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

        degree_sum = 0
        for relevant_node in relevant_nodes:
            degree_sum += self.graph.degree(relevant_node)

        # NOTE: a node without any two-hop neighbors leaves relevant_nodes empty and raises a ZeroDivisionError here
        return float(degree_sum)/float(len(relevant_nodes))

    def betweenness_centrality(self, node):
        if not hasattr(self, 'all_betweenness_centralities'):
            self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
        return self.all_betweenness_centralities[node]

    def eccentricity(self, node):
        if not hasattr(self, 'all_eccentricities'):
            self.all_eccentricities = nx.eccentricity(self.graph)
        return self.all_eccentricities[node]

    def average_shortest_path_length(self, node):
        # caching average_shortest_path_length for all nodes at once failed,
        # so this already uses a per-node calculation

        # get all shortest path lengths from this node
        all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)

        # calculate the average
        sum_of_lengths = 0
        for target in all_shortest_path_lengths_for_node:
            sum_of_lengths += all_shortest_path_lengths_for_node[target]

        return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)

    #############
    # corrections
    #############

    def correct_clustering_coefficient(self, node):
        clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node), 'clustering_coefficient'))
        degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
        corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)

        return corrected_cc

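    # worked example for the correction above: with clustering_coefficient = 0.5 and degree = 4
    # the corrected value is 0.5 * (4 * 0.5) / 4.0 = 0.25
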
    # earlier variant of the correction which looped over all nodes itself:
    # def correct_clustering_coefficient(self):
    #     for node in self.nodes:
    #         clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node), 'clustering_coefficient'))
    #         degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
    #
    #         corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
    #
    #         self.redis.hset(self.node_prefix+str(node), 'corrected_clustering_coefficient', corrected_cc)
    #         self.redis.zadd('corrected_clustering_coefficient', corrected_cc, str(node))

    def correct_average_neighbor_degree(self, node):
        avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))

        neighbors = self.graph.neighbors(node)
        number_of_neighbors = float(len(neighbors))
        neighbor_degrees = []
        for neighbor in neighbors:
            neighbor_degrees.append(self.graph.degree(neighbor))

        # using numpy's median and standard deviation implementations
        numpy_neighbor_degrees = np.array(neighbor_degrees)
        median = np.median(numpy_neighbor_degrees)
        standard_deviation = np.std(numpy_neighbor_degrees)

        if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
            return avgnd
        else:
            return avgnd + (((median - avgnd) / standard_deviation) / number_of_neighbors) * avgnd

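    # worked example for the correction term above: with avgnd = 4.0, median = 6.0,
    # standard_deviation = 2.0 and number_of_neighbors = 5.0 the result is
    # 4.0 + (((6.0 - 4.0) / 2.0) / 5.0) * 4.0 = 4.8
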
    def correct_iterated_average_neighbor_degree(self, node):
        # NOTE: this reads the plain 'average_neighbor_degree'; given the method name,
        # 'iterated_average_neighbor_degree' may have been intended here
        avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))

        first_level_neighbors = self.graph.neighbors(node)
        second_level_neighbors = []

        # get all two-hop nodes
        for first_level_neighbor in first_level_neighbors:
            current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
            second_level_neighbors.extend(current_second_level_neighbors)

        # remove one-hop neighbors and the node itself
        relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

        number_of_nodes = len(relevant_nodes)
        node_degrees = []
        for rel_node in relevant_nodes:
            node_degrees.append(self.graph.degree(rel_node))

        numpy_node_degrees = np.array(node_degrees)
        median = np.median(numpy_node_degrees)
        standard_deviation = np.std(numpy_node_degrees)

        if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
            return avgnd
        else:
            return avgnd + (((median - avgnd) / standard_deviation) / number_of_nodes) * avgnd

    ################
    # advanced scores
    ################

    def urs_clustering_coefficient_modification(self):
        # caching of values: read the full sorted sets once instead of issuing one hget per node
        all_ccs_normalized = dict(self.redis.zrange('corrected_clustering_coefficient'+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float))
        all_urs = dict(self.redis.zrange('unified_risk_score', 0, -1, withscores=True, score_cast_func=float))

        urs_percentile_10 = np.percentile(all_urs.values(), 10)
        urs_percentile_90 = np.percentile(all_urs.values(), 90)

        for node in self.nodes:
            # cc_normalized = float(self.redis.hget(self.node_prefix+str(node), 'corrected_clustering_coefficient'+self.normalization_suffix))
            # urs = float(self.redis.hget(self.node_prefix+str(node), 'unified_risk_score'))

            cc_normalized = all_ccs_normalized[str(node)]
            urs = all_urs[str(node)]

            # only modify the score for nodes in the top or bottom decile of the unified risk score
            if (urs >= urs_percentile_90 or urs <= urs_percentile_10):
                if (cc_normalized >= 0.25):
                    advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
                else:
                    advanced_unified_risk_score = urs
            else:
                advanced_unified_risk_score = urs

            # save for node
            self.redis.hset(self.node_prefix+str(node), 'advanced_unified_risk_score', advanced_unified_risk_score)
            # save for metric
            self.redis.zadd('advanced_unified_risk_score', advanced_unified_risk_score, str(node))
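
# minimal usage sketch (an assumption, not part of the module): this presumes a redis server
# reachable on localhost:6379, a redis-py 2.x client (see the zadd note above), a Python 2 era
# environment with a NetworkX 1.x graph whose neighbors() returns lists, and integer node labels
#
#   import networkx as nx
#   from metric_calculator import MetricCalculator
#
#   graph = nx.barabasi_albert_graph(100, 2)   # any undirected graph with integer nodes
#   calculator = MetricCalculator(graph)
#   calculator.start()
#
#   # afterwards the results can be read back from redis, e.g.:
#   #   calculator.redis.hgetall('node_metrics:0')
#   #   calculator.redis.zrange('advanced_unified_risk_score', 0, -1, withscores=True)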