File advancedscores.py added (mode: 100644) (index 0000000..489636a)

# advancedscores.py
import numpy as np

################
# advanced scores
################

def adv_unified_risk_score(self):

    # cache all values in dictionaries
    all_ccs_normalized = dict(self.redis.zrange(self.metric_prefix+'corrected_clustering_coefficient'+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float))
    all_urs = dict(self.redis.zrange(self.score_prefix+'unified_risk_score', 0, -1, withscores=True, score_cast_func=float))

    urs_percentile_10 = np.percentile(all_urs.values(), 10)
    urs_percentile_90 = np.percentile(all_urs.values(), 90)

    for node in self.nodes:
        cc_normalized = all_ccs_normalized[str(node)]
        urs = all_urs[str(node)]

        # only adjust the URS for nodes in the top or bottom decile,
        # and only when the normalized clustering coefficient is large enough
        if urs >= urs_percentile_90 or urs <= urs_percentile_10:
            if cc_normalized >= 0.25:
                advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
            else:
                advanced_unified_risk_score = urs
        else:
            advanced_unified_risk_score = urs

        # save for node
        self.redis.hset(self.node_prefix+str(node), 'advanced_unified_risk_score', advanced_unified_risk_score)
        # save for score
        self.redis.zadd(self.score_prefix+'advanced_unified_risk_score', advanced_unified_risk_score, str(node))
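Note that adv_unified_risk_score is a module-level function whose first parameter is self: it is evidently meant to be bound to (or called with) an instance that provides graph, nodes, redis, and the key prefixes from config.py. A minimal sketch of such an instance, assuming a hypothetical GraphAnalyzer class that is not part of this commit:

# hypothetical wiring, for illustration only
import redis

import config
import advancedscores

class GraphAnalyzer(object):
    def __init__(self, graph):
        self.graph = graph
        self.nodes = graph.nodes()
        self.redis = redis.StrictRedis(host='localhost', port=6379, db=0)
        # key prefixes and metric definitions mirrored from config.py
        self.node_prefix = config.node_prefix
        self.metric_prefix = config.metric_prefix
        self.score_prefix = config.score_prefix
        self.statistics_prefix = config.statistics_prefix
        self.normalization_suffix = config.normalization_suffix
        self.base_metrics = config.base_metrics
        self.advanced_metrics = config.advanced_metrics

    def calculate_advanced_scores(self):
        # each advanced score function expects the analyzer instance as `self`
        for score_name, score_method in config.advanced_scores.items():
            score_method(self)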
File config.py added (mode: 100644) (index 0000000..d4c8c5e)

# config.py
import metrics
import normalizations
import advancedscores

node_index_key = 'all_nodes'
metric_index_key = 'all_metrics'
score_index_key = 'all_scores'

node_neighbors_prefix = 'node_neighbors:'
node_prefix = 'node_metrics:'
metric_prefix = 'metric:'
score_prefix = 'score:'
statistics_prefix = 'statistics:'

normalization_suffix = '_normalized'

# definition of all base metrics for which absolute values will be calculated for each node in the first step
# key is the name of the metric and value is the implemented method which exposes the required interface
# interface: each method takes the node as the single parameter, performs the necessary calculation and
# returns a float containing the value for the specified node

base_metrics = {'clustering_coefficient'          : metrics.clustering_coefficient,
                'degree'                          : metrics.degree,
                'average_neighbor_degree'         : metrics.average_neighbor_degree,
                'iterated_average_neighbor_degree': metrics.iterated_average_neighbor_degree,
                'betweenness_centrality'          : metrics.betweenness_centrality,
                'eccentricity'                    : metrics.eccentricity,
                'average_shortest_path_length'    : metrics.average_shortest_path_length}


# some metrics might require corrections or post-processing which rely on the values of other metrics or normalizations
# key is the metric name and value the method for correction

advanced_metrics = {'corrected_clustering_coefficient'          : metrics.correct_clustering_coefficient,
                    'corrected_average_neighbor_degree'         : metrics.correct_average_neighbor_degree,
                    'corrected_iterated_average_neighbor_degree': metrics.correct_iterated_average_neighbor_degree}


# for every metric, a normalization method has to be specified
# key is the name of the metric and value is the normalization method which also has to expose the required interface
# interface: normalization methods take the name of the (absolute) metric as the single argument; no return value is required
# the method itself shall read the data required for normalization from the redis instance,
# using the corresponding keys/values for the specified metric
# it shall then loop over all nodes and calculate the normalized value for each node and metric
# afterwards it should save the result to redis using "metric_name_normalized" as the key
# the result is stored inside the node's hash for metrics

# this dictionary also needs to include the corrected metrics under their respective names

normalization_methods = {'clustering_coefficient'                    : normalizations.min_max,
                         'corrected_clustering_coefficient'          : normalizations.min_max,
                         'degree'                                    : normalizations.min_max,
                         'average_neighbor_degree'                   : normalizations.min_max,
                         'corrected_average_neighbor_degree'         : normalizations.min_max,
                         'iterated_average_neighbor_degree'          : normalizations.min_max,
                         'corrected_iterated_average_neighbor_degree': normalizations.min_max,
                         'betweenness_centrality'                    : normalizations.min_max,
                         'eccentricity'                              : normalizations.max_min,
                         'average_shortest_path_length'              : normalizations.max_min}


# the simplest kind of score is a weighted combination of normalized metric values whose weights add up to 1
# such scores can easily be defined here
# note: the names are redis keys, not methods

scores = {'unified_risk_score': {'degree': 0.25,
                                 'corrected_average_neighbor_degree': 0.15,
                                 'corrected_iterated_average_neighbor_degree': 0.1,
                                 'betweenness_centrality': 0.25,
                                 'eccentricity': 0.125,
                                 'average_shortest_path_length': 0.125}}


# other scores might require a more sophisticated algorithm to be calculated
# such scores need to be added here and implemented like the example below

advanced_scores = {'advanced_unified_risk_score': advancedscores.adv_unified_risk_score}
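Since the simple scores are defined as weighted sums, the weights of each score must add up to 1. A quick sanity check one could run against this config (illustrative, not part of the commit):

import config

for score_name, weights in config.scores.items():
    total = sum(weights.values())
    # 0.25 + 0.15 + 0.1 + 0.25 + 0.125 + 0.125 == 1.0 for unified_risk_score
    assert abs(total - 1.0) < 1e-9, '%s weights sum to %s' % (score_name, total)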
File metrics.py added (mode: 100644) (index 0000000..d0b9c8e)

# metrics.py
import networkx as nx
import numpy as np

def clustering_coefficient(self, node):
    # on the first call, calculate the metric for all nodes at once and cache it in a hash on the instance
    # NOTE: this should result in a performance gain, but for very large graphs it might be a problem;
    #       in that case, just returning nx.clustering(self.graph, node) might be better
    if not hasattr(self, 'all_clustering_coefficients'):
        self.all_clustering_coefficients = nx.clustering(self.graph)

    # get the actual value from the pre-calculated hash
    return self.all_clustering_coefficients[node]

def degree(self, node):
    return self.graph.degree(node)


def average_neighbor_degree(self, node):
    # same caching technique as in self.clustering_coefficient
    # might also break for very large graphs
    # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go

    if not hasattr(self, 'all_average_neighbor_degrees'):
        self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
    return self.all_average_neighbor_degrees[node]

def iterated_average_neighbor_degree(self, node):

    first_level_neighbors = self.graph.neighbors(node)
    second_level_neighbors = []

    # get all two-hop nodes
    for first_level_neighbor in first_level_neighbors:
        current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
        second_level_neighbors.extend(current_second_level_neighbors)

    # remove one-hop nodes and self
    relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

    # guard added: avoid division by zero for nodes without any two-hop neighbors
    if not relevant_nodes:
        return 0.0

    degree_sum = 0
    for relevant_node in relevant_nodes:
        degree_sum += self.graph.degree(relevant_node)

    return float(degree_sum) / float(len(relevant_nodes))

def betweenness_centrality(self, node):
    if not hasattr(self, 'all_betweenness_centralities'):
        self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
    return self.all_betweenness_centralities[node]

def eccentricity(self, node):
    if not hasattr(self, 'all_eccentricities'):
        self.all_eccentricities = nx.eccentricity(self.graph)
    return self.all_eccentricities[node]

def average_shortest_path_length(self, node):
    # caching average_shortest_path_length for all nodes at once failed,
    # so it has already been switched to per-node calculation

    # get all shortest path lengths
    all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)

    # calculate the average
    sum_of_lengths = 0
    for target in all_shortest_path_lengths_for_node:
        sum_of_lengths += all_shortest_path_lengths_for_node[target]

    return float(sum_of_lengths) / len(all_shortest_path_lengths_for_node)


#############
# advanced metrics
#############
def correct_clustering_coefficient(self, node):
    clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node), 'clustering_coefficient'))
    degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
    corrected_cc = clustering_coefficient + (degree * clustering_coefficient) / 4.0
    return corrected_cc

def correct_average_neighbor_degree(self, node):
    avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))

    neighbors = self.graph.neighbors(node)
    number_of_neighbors = float(len(neighbors))
    neighbor_degrees = []
    for neighbor in neighbors:
        neighbor_degrees.append(self.graph.degree(neighbor))

    # using numpy's median and standard deviation implementations
    numpy_neighbor_degrees = np.array(neighbor_degrees)
    median = np.median(numpy_neighbor_degrees)
    standard_deviation = np.std(numpy_neighbor_degrees)

    if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
        return avgnd
    else:
        return avgnd + (((median - avgnd) / standard_deviation) / number_of_neighbors) * avgnd


def correct_iterated_average_neighbor_degree(self, node):
    avgnd = float(self.redis.hget(self.node_prefix+str(node), 'iterated_average_neighbor_degree'))

    first_level_neighbors = self.graph.neighbors(node)
    second_level_neighbors = []

    # get all two-hop nodes
    for first_level_neighbor in first_level_neighbors:
        current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
        second_level_neighbors.extend(current_second_level_neighbors)

    # remove one-hop neighbors and self
    relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

    number_of_nodes = len(relevant_nodes)
    node_degrees = []
    for rel_node in relevant_nodes:
        node_degrees.append(self.graph.degree(rel_node))

    numpy_node_degrees = np.array(node_degrees)
    median = np.median(numpy_node_degrees)
    standard_deviation = np.std(numpy_node_degrees)

    if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
        return avgnd
    else:
        return avgnd + (((median - avgnd) / standard_deviation) / number_of_nodes) * avgnd
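To make the correction formula concrete: it shifts the average neighbor degree toward the median, scaled by the spread of the neighbor degrees and damped by the neighborhood size. A worked example with illustrative numbers:

import numpy as np

neighbor_degrees = np.array([2, 3, 10])        # hypothetical neighbor degrees
avgnd = 5.0                                    # their mean
median = np.median(neighbor_degrees)           # 3.0
standard_deviation = np.std(neighbor_degrees)  # ~3.56
n = float(len(neighbor_degrees))               # 3.0

corrected = avgnd + (((median - avgnd) / standard_deviation) / n) * avgnd
# ((3.0 - 5.0) / 3.56) / 3.0 * 5.0 ~= -0.94, so corrected ~= 4.06:
# the median sitting below the mean pulls the corrected value down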
File normalizations.py added (mode: 100644) (index 0000000..a959a8c)

# normalizations.py

# min-max normalization
def min_max(self, metric_name):
    # perform min-max normalization of the specified metric for all nodes

    # get min and max from redis (the sorted set is ordered by score)
    x_min = self.redis.zrange(self.metric_prefix+metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
    x_max = self.redis.zrange(self.metric_prefix+metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

    for node in self.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
            x_normalized = (x - x_min) / (x_max - x_min)

        # store value for node and metric
        self.redis.zadd(self.metric_prefix+metric_name+self.normalization_suffix, x_normalized, str(node))
        self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)

# max-min normalization (inverted scale)
def max_min(self, metric_name):
    x_min = self.redis.zrange(self.metric_prefix+metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
    x_max = self.redis.zrange(self.metric_prefix+metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

    for node in self.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
            x_normalized = (x_max - x) / (x_max - x_min)

        # store value for node and metric
        self.redis.zadd(self.metric_prefix+metric_name+self.normalization_suffix, x_normalized, str(node))
        self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)
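The two methods differ only in direction: min_max maps the largest raw value to 1.0, while max_min inverts the scale so the smallest raw value maps to 1.0, which is why config.py assigns max_min to eccentricity and average_shortest_path_length, where smaller values indicate a more central node. A small numeric illustration:

x_min, x_max, x = 1.0, 9.0, 3.0                 # illustrative values
min_max_value = (x - x_min) / (x_max - x_min)   # 0.25: larger raw values score higher
max_min_value = (x_max - x) / (x_max - x_min)   # 0.75: smaller raw values score higher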
File pearson.py added (mode: 100644) (index 0000000..7a6cc1c)

# pearson.py
# standalone script: computes pairwise Pearson correlations between all metrics stored in redis
import redis as rd
import numpy as np
from scipy.stats import pearsonr

metrics = ['clustering_coefficient',
           'degree',
           'average_neighbor_degree',
           'iterated_average_neighbor_degree',
           'betweenness_centrality',
           'eccentricity',
           'average_shortest_path_length',
           'corrected_clustering_coefficient',
           'corrected_average_neighbor_degree',
           'corrected_iterated_average_neighbor_degree']

rdb = rd.StrictRedis(host='localhost', port=6379, db=0)


correlations = {}
for metric1 in metrics:
    correlations[metric1] = {}
    for metric2 in metrics:
        correlations[metric1][metric2] = (0, 0)
        if metric1 == metric2:
            # a metric correlates perfectly with itself
            correlations[metric1][metric2] = (1, 0)
            continue

        dict_metric1 = dict(rdb.zrange(metric1, 0, -1, withscores=True, score_cast_func=float))
        dict_metric2 = dict(rdb.zrange(metric2, 0, -1, withscores=True, score_cast_func=float))

        values_metric1 = []
        values_metric2 = []

        # align both value lists by sorting on the node keys
        for key in sorted(dict_metric1.iterkeys()):
            values_metric1.append(dict_metric1[key])

        for key in sorted(dict_metric2.iterkeys()):
            values_metric2.append(dict_metric2[key])

        correlations[metric1][metric2] = pearsonr(values_metric1, values_metric2)

for source in correlations:
    for target in correlations[source]:
        rdb.hset("correlations:"+source+":"+target, "correlation", correlations[source][target][0])
        rdb.hset("correlations:"+source+":"+target, "confidence", correlations[source][target][1])
File statistics.py added (mode: 100644) (index 0000000..fb03eaa)

# statistics.py
import redis as rd
import numpy as np
from scipy.stats import pearsonr

def calculate_statistics(self, metric, redis_key):
    all_values = dict(self.redis.zrange(redis_key, 0, -1, withscores=True, score_cast_func=float)).values()
    min_value = np.min(all_values)
    max_value = np.max(all_values)

    average = np.average(all_values)
    median = np.median(all_values)
    standard_deviation = np.std(all_values)

    self.redis.hset(self.statistics_prefix+metric, 'min', min_value)
    self.redis.hset(self.statistics_prefix+metric, 'max', max_value)
    self.redis.hset(self.statistics_prefix+metric, 'average', average)
    self.redis.hset(self.statistics_prefix+metric, 'median', median)
    self.redis.hset(self.statistics_prefix+metric, 'standard_deviation', standard_deviation)


def calculate_correlations(self):
    # correlate all base and advanced metrics with each other
    m = self.base_metrics.keys()
    c = self.advanced_metrics.keys()
    metrics = m + c

    correlations = {}
    for metric1 in metrics:
        correlations[metric1] = {}
        for metric2 in metrics:
            correlations[metric1][metric2] = (0, 0)
            if metric1 == metric2:
                # a metric correlates perfectly with itself
                correlations[metric1][metric2] = (1, 0)
                continue

            dict_metric1 = dict(self.redis.zrange(self.metric_prefix+metric1, 0, -1, withscores=True, score_cast_func=float))
            dict_metric2 = dict(self.redis.zrange(self.metric_prefix+metric2, 0, -1, withscores=True, score_cast_func=float))
            values_metric1 = []
            values_metric2 = []

            # align both value lists by sorting on the node keys
            for key in sorted(dict_metric1.iterkeys()):
                values_metric1.append(dict_metric1[key])

            for key in sorted(dict_metric2.iterkeys()):
                values_metric2.append(dict_metric2[key])

            correlations[metric1][metric2] = pearsonr(values_metric1, values_metric2)

    for source in correlations:
        for target in correlations[source]:
            self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "correlation", correlations[source][target][0])
            self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "confidence", correlations[source][target][1])
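Once calculate_statistics has run, the values can be read back from the statistics hash. An illustrative read-back, assuming an analyzer instance wired up as sketched after advancedscores.py above (the field values shown are hypothetical):

stats = analyzer.redis.hgetall(analyzer.statistics_prefix + 'degree')
# e.g. {'min': '1.0', 'max': '42.0', 'average': '7.3',
#       'median': '5.0', 'standard_deviation': '6.1'}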