File config.py added (mode: 100644) (index 0000000..8410e0b)

#config.py
import metrics
import normalizations
import advancedscores

#redis keys for indexes and values
node_index_key = 'all_nodes'
metric_index_key = 'all_metrics'
score_index_key = 'all_scores'

node_neighbors_prefix = 'node_neighbors:'
node_prefix = 'node_metrics:'
metric_prefix = 'metric:'
score_prefix = 'score:'
statistics_prefix = 'statistics:'

normalization_suffix = '_normalized'

# definition of all base metrics for which absolute values will be calculated for each node in the first step
# key is the name of the metric and value is the implemented method which exposes the required interface
# interface: each method takes the calculator instance and the node as parameters, performs the necessary
# calculation and returns a float containing the value for the specified node

base_metrics = {'clustering_coefficient'            : metrics.clustering_coefficient,
                'degree'                             : metrics.degree,
                'average_neighbor_degree'            : metrics.average_neighbor_degree,
                'iterated_average_neighbor_degree'   : metrics.iterated_average_neighbor_degree,
#               'betweenness_centrality'             : metrics.betweenness_centrality,
                'betweenness_centrality_gt'          : metrics.betweenness_centrality_gt,
#               'eccentricity'                       : metrics.eccentricity,
                'average_shortest_path_length'       : metrics.average_shortest_path_length
               }


# some metrics might require corrections or post-processing which relies on the values of other metrics or normalizations
# key is the metric name and value the method performing the correction

advanced_metrics = {'corrected_clustering_coefficient'           : metrics.correct_clustering_coefficient,
                    'corrected_average_neighbor_degree'          : metrics.correct_average_neighbor_degree,
                    'corrected_iterated_average_neighbor_degree' : metrics.correct_iterated_average_neighbor_degree}


# for every metric, a normalization method has to be specified
# key is the name of the metric and value is the normalization method which also has to expose the required interface
# interface: normalization methods take the calculator instance and the name of the (absolute) metric, no return value is required
# the method itself shall read the data required for normalization from the redis instance,
# using the corresponding keys/values for the specified metric
# it shall then loop over all nodes and calculate the normalized value for the node and the metric
# afterwards it should save the result to redis using "metric_name_normalized" as the key
# the result is stored inside the node's hash for metrics

# corrected metrics also need to be listed here under their respective names
normalization_methods = {'clustering_coefficient'                     : normalizations.min_max,
                         'corrected_clustering_coefficient'           : normalizations.min_max,
                         'degree'                                     : normalizations.min_max,
                         'average_neighbor_degree'                    : normalizations.min_max,
                         'corrected_average_neighbor_degree'          : normalizations.min_max,
                         'iterated_average_neighbor_degree'           : normalizations.min_max,
                         'corrected_iterated_average_neighbor_degree' : normalizations.min_max,
#                        'betweenness_centrality'                     : normalizations.min_max,
                         'betweenness_centrality_gt'                  : normalizations.min_max,
#                        'eccentricity'                               : normalizations.max_min,
                         'average_shortest_path_length'               : normalizations.max_min
                        }


# the simplest case for a score is a weighted combination of normalized metric values, with weights that add up to 1
# such scores can easily be defined here
# note: the metric names here refer to redis keys (normalized values), not to methods

scores = {'unified_risk_score': {'degree'                                     : 0.25,
                                 'corrected_average_neighbor_degree'          : 0.15,
                                 'corrected_iterated_average_neighbor_degree' : 0.1,
                                 'betweenness_centrality_gt'                  : 0.25,
#                                'eccentricity'                               : 0.125,
                                 'average_shortest_path_length'               : 0.25}
          }


# other scores might require a more sophisticated algorithm to be calculated
# such scores need to be added here and implemented like the example below

advanced_scores = {'advanced_unified_risk_score': advancedscores.adv_unified_risk_score}
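
The interface described in the comments of config.py can be illustrated with a small sketch. The metric name two_hop_count and the function below are hypothetical and not part of this commit; they only show the shape a further entry in base_metrics would take: a module-level function that receives the calculator instance and a node id and returns a float.

# hypothetical example, not part of this commit: an additional base metric
def two_hop_count(self, node):
    # count the distinct nodes exactly two hops away from 'node'
    neighbors = set(self.graph.neighbors(node))
    two_hop = set()
    for neighbor in neighbors:
        two_hop.update(self.graph.neighbors(neighbor))
    two_hop -= neighbors
    two_hop.discard(node)
    return float(len(two_hop))

# it would then be registered alongside the existing entries, e.g.
# base_metrics['two_hop_count'] = two_hop_count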
File file_importer.py added (mode: 100644) (index 0000000..68b7ae9)

import networkx as nx
import graph_tool.all as gt

class FileImporter(object):
    def __init__(self, filename):
        # initialize the data file to parse and new empty graphs
        print('Starting file importer!')
        self.data_file = open(filename)
        self.graph = nx.Graph()
        self.graph_gt = gt.Graph(directed=False)
        self.graph_gt_labels = self.graph_gt.new_vertex_property("double")

    def read(self):
        for line in self.data_file:
            print("Parsing line", line)
            self.parse_line(line)
        return self.graph
        # return {'graph':self.graph, 'graph_gt':self.graph_gt, 'graph_gt_labels':self.graph_gt_labels}
        #self.graph,self.graph_gt,self.graph_gt_labels

    def read_gt(self):
        return {'graph_gt': self.graph_gt, 'graph_gt_labels': self.graph_gt_labels}

    def parse_line(self, line):
        # split each line on the tab character
        # the first field specifies the source node
        # the second field specifies the target node

        fields = line.strip().split("\t")
        from_node = int(fields[0])
        to_node = int(fields[1])

        # print('\n')
        # print('From node is',from_node)
        # print('To node is',to_node)

        # add the edge to the networkx graph, skipping self-loops
        if from_node != to_node:
            self.graph.add_edge(from_node, to_node)
        # print('Network X graph has the following number of nodes',self.graph.number_of_nodes())
        # print('Network X graph has the following number of edges',self.graph.number_of_edges())


        # add the edge to the graph_tool graph and maintain a property map of labels
        # check whether the nodes are already present and create new ones if not
        #temp = gt.Graph(directed=False)
        #temp_name = temp.new_vertex_property("string")
        temp = self.graph_gt
        temp_name = self.graph_gt_labels

        check = None
        if from_node != to_node:  # skip self-loops
            index_from = gt.find_vertex(temp, temp_name, from_node)
            # print('Index from is',index_from)
            index_to = gt.find_vertex(temp, temp_name, to_node)
            # print('Index to is',index_to)
            if index_from == [] and index_to == []:
                # print('No vertices found')
                c1 = temp.add_vertex()
                temp_name[temp.vertex(c1)] = from_node
                # print('Temp_name is now',temp_name[temp.vertex(c1)])
                c2 = temp.add_vertex()
                temp_name[temp.vertex(c2)] = to_node
                # print('Temp_name is now',temp_name[temp.vertex(c2)])
            if index_from != [] and index_to == []:
                # print('Index from is')
                # print(index_from[0])
                c1 = index_from[0]
                #print('C1 is',c1)
                c2 = temp.add_vertex()
                #print('C2 is'),
                #print(c2)
                temp_name[temp.vertex(c2)] = to_node
                # print('Temp_name is now',temp_name[temp.vertex(c2)])
            if index_to != [] and index_from == []:
                # print('Index to is')
                # print(index_to[0])
                c1 = temp.add_vertex()
                c2 = index_to[0]
                temp_name[temp.vertex(c1)] = from_node
                # print('Temp_name is now',temp_name[temp.vertex(c1)])
            if index_from != [] and index_to != []:
                # print('Both vertices found')
                c1 = index_to[0]
                c2 = index_from[0]
                check = temp.edge(c1, c2)  # check if the edge is already present
                # print('Check is',check)
            if check is None:
                # print("Adding edge between",c1,"and",c2)
                temp.add_edge(c1, c2)

            #print(temp_name)
            self.graph_gt = temp
            self.graph_gt_labels = temp_name

        # Check whether GT and NetworkX graphs have the same number of nodes and edges
        # if (self.graph_gt.num_vertices() != self.graph.number_of_nodes()):
        #     print('Unequal number of vertices detected at from node',from_node,'to node',to_node)
        #     print('Number of vertices in Gt Graph is',self.graph_gt.num_vertices())
        #     print('Number of vertices in NetworkX is',self.graph.number_of_nodes())
        # else:
        #     print('Equal number of vertices in both graphs')

        # if (self.graph_gt.num_edges() != self.graph.number_of_edges()):
        #     print('Unequal number of edges detected at from node',from_node,'to node',to_node)
        #     print('Number of edges in Gt Graph is',self.graph_gt.num_edges())
        #     print('Number of edges in NetworkX is',self.graph.number_of_edges())
        # else:
        #     print('Equal number of edges in both graphs')

        # if (self.graph.number_of_nodes() <> self.graph_gt.
        # print('Graph tool graph is',self.graph_gt)
        # print('Graph tool labels map is',self.graph_gt_labels)
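
FileImporter.parse_line expects one edge per line, with the source and target node ids separated by a tab. A minimal usage sketch follows; the file name and the edge list are invented for illustration and are not part of this commit.

# hypothetical usage example, not part of this commit
from file_importer import FileImporter

edges = '1\t2\n1\t3\n2\t3\n'           # three undirected edges between nodes 1, 2 and 3
with open('example_edges.txt', 'w') as f:
    f.write(edges)

importer = FileImporter('example_edges.txt')
nx_graph = importer.read()              # networkx graph: 3 nodes, 3 edges
gt_data = importer.read_gt()            # {'graph_gt': <graph_tool graph>, 'graph_gt_labels': <label property map>}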
File metric_calculator.py added (mode: 100644) (index 0000000..c2cc665)

import networkx as nx
import graph_tool.all as gt
import redis as rd
import numpy as np
import indexing
import statistics
import normalizations
import config
import datetime as dt


class MetricCalculator(object):
    def __init__(self, graph, graph_gt):
        # class constructor
        # define required class variables such as the graph to work on, the redis connection and the nodes of the graph

        print('Starting metric_calculator!')
        self.graph = graph
        self.graph_gt = graph_gt
        # self.graph_gt_labels = graph_gt_labels
        self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
        self.nodes = nx.nodes(graph)


        # configuration variables are read from the config file and are also saved to class variables for easy access
        self.node_index_key = config.node_index_key
        self.metric_index_key = config.metric_index_key
        self.score_index_key = config.score_index_key

        self.node_neighbors_prefix = config.node_neighbors_prefix
        self.node_prefix = config.node_prefix
        self.metric_prefix = config.metric_prefix
        self.score_prefix = config.score_prefix
        self.statistics_prefix = config.statistics_prefix

        self.normalization_suffix = config.normalization_suffix

        self.base_metrics = config.base_metrics
        self.advanced_metrics = config.advanced_metrics

        self.normalization_methods = config.normalization_methods

        self.scores = config.scores
        self.advanced_scores = config.advanced_scores


    def start(self):
        # clean all data in Redis
        self.redis.flushdb()

        # index creation
        self.create_indexes()

        # main calculations
        self.calculate_metrics()
        self.calculate_advanced_metrics()
        self.normalize_metrics()
        self.calculate_scores()
        self.calculate_advanced_scores()

        # statistics
        self.calculate_statistics()

    ##################
    #### INDEXING ####
    ##################
    def create_indexes(self):
        # call methods defined in indexing.py
        indexing.index_nodes(self)
        indexing.index_neighbors(self)
        indexing.index_metrics(self)
        indexing.index_scores(self)

    ###########################
    #### CALCULATION LOOPS ####
    ###########################

    def calculate_metrics(self):
        # loop through all defined metrics and call the specified calculation method for each node
        print('Starting calculate_metrics')
        for metric_name in self.base_metrics:
            metric_method = self.base_metrics[metric_name]

            # loop through all nodes
            for node in self.nodes:
                # call the calculation method of the supplied metric for the current node
                node = int(node)
                value = float(metric_method(self, node))

                # store the result in the node's hash of values
                self.redis.hset(self.node_prefix+str(node), metric_name, value)

                # also store the result in the metric's sorted set
                self.redis.zadd(self.metric_prefix+metric_name, value, str(node))


    def calculate_advanced_metrics(self):
        # loop through all defined advanced metrics and call the specified calculation method
        print('Starting calculate_advanced_metrics')
        for advanced_metric_name in self.advanced_metrics:
            metric_method = self.advanced_metrics[advanced_metric_name]

            # loop through all nodes
            for node in self.nodes:
                node = int(node)
                value = float(metric_method(self, node))

                # store the result in the node's hash of values
                self.redis.hset(self.node_prefix+str(node), advanced_metric_name, value)

                # also store the result in the metric's sorted set
                self.redis.zadd(self.metric_prefix+advanced_metric_name, value, str(node))


    # loop through all defined normalizations and call the respective normalization method
    # metrics not listed in the "normalization_methods" hash fall back to min-max normalization
    def normalize_metrics(self):
        print('Starting normalize_metrics')
        all_metrics = dict(self.base_metrics.items() + self.advanced_metrics.items())

        for metric_name in all_metrics:
            if metric_name in self.normalization_methods:
                normalization_method = self.normalization_methods[metric_name]
            else:
                # fallback normalization is min-max
                normalization_method = normalizations.min_max
            normalization_method(self, metric_name)


    def calculate_scores(self):
        print('Starting calculate_scores')
        for score_name in self.scores:
            metrics_with_weights = self.scores[score_name]

            for node in self.nodes:
                score_value = 0.0

                # combine the normalized values according to their weights
                for metric in metrics_with_weights:
                    weight = self.scores[score_name][metric]
                    value = float(self.redis.hget(self.node_prefix+str(node), metric+self.normalization_suffix))
                    score_value += weight * value

                self.redis.hset(self.node_prefix+str(node), score_name, score_value)
                self.redis.zadd(self.score_prefix+score_name, score_value, str(node))

    def calculate_advanced_scores(self):
        print('Starting calculate_advanced_scores')
        for advanced_score in self.advanced_scores:
            self.advanced_scores[advanced_score](self)


    #############
    # statistics
    #############

    def calculate_statistics(self):
        print('Starting calculate_statistics')
        for metric in self.base_metrics:
            # absolute and normalized values
            statistics.calculate_statistics(self, metric, self.metric_prefix+metric)
            statistics.calculate_statistics(self, metric+self.normalization_suffix, self.metric_prefix+metric+self.normalization_suffix)

        for advanced_metric in self.advanced_metrics:
            # absolute and normalized values
            statistics.calculate_statistics(self, advanced_metric, self.metric_prefix+advanced_metric)
            statistics.calculate_statistics(self, advanced_metric+self.normalization_suffix, self.metric_prefix+advanced_metric+self.normalization_suffix)

        for score in self.scores:
            statistics.calculate_statistics(self, score, self.score_prefix+score)

        for advanced_score in self.advanced_scores:
            statistics.calculate_statistics(self, advanced_score, self.score_prefix+advanced_score)

        statistics.calculate_correlations(self)
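
The calculation loops above store every value twice: in a per-node hash under node_prefix + node id, and in a per-metric sorted set under metric_prefix + metric name (scores go to score_prefix accordingly). A small read-back sketch, assuming the calculator has already run against the local Redis instance configured above; the node id 42 is an arbitrary example.

# hypothetical read-back example, not part of this commit
import redis as rd
import config

r = rd.StrictRedis(host='localhost', port=6379, db=0)

# every absolute, corrected and normalized value of one node
print(r.hgetall(config.node_prefix + '42'))

# the ten nodes with the highest unified_risk_score, best first
print(r.zrevrange(config.score_prefix + 'unified_risk_score', 0, 9, withscores=True))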
File metrics.py added (mode: 100644) (index 0000000..6673fc8)

#metrics.py
import networkx as nx
import numpy as np
import datetime as dt
import graph_tool.all as gt

def clustering_coefficient(self, node):
    print('Calculating clustering_coefficient for node', node)
    # on the first call, calculate the metric for all nodes at once and cache it in a hash on the instance for later access
    # NOTE: this should result in a performance gain, but for very large graphs it might be a problem.
    #       in that case, just returning nx.clustering(self.graph, node) might be better
    if not hasattr(self, 'all_clustering_coefficients'):
        self.all_clustering_coefficients = nx.clustering(self.graph)

    # get the actual value from the pre-calculated hash
    return self.all_clustering_coefficients[node]

def degree(self, node):
    print('Calculating degree for node', node)
    return self.graph.degree(node)


def average_neighbor_degree(self, node):
    print('Calculating average_neighbor_degree for node', node)
    # same caching technique as in clustering_coefficient
    # might also break for very large graphs
    # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go

    if not hasattr(self, 'all_average_neighbor_degrees'):
        self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
    return self.all_average_neighbor_degrees[node]

def iterated_average_neighbor_degree(self, node):
    print('Calculating iterated_average_neighbor_degree for node', node)
    first_level_neighbors = self.graph.neighbors(node)
    # print('First level neighbors are', first_level_neighbors)
    second_level_neighbors = []
    # print('Second level neighbors are', second_level_neighbors)

    # get all two-hop nodes
    for first_level_neighbor in first_level_neighbors:
        current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
        second_level_neighbors.extend(current_second_level_neighbors)

    # remove one-hop nodes and the node itself
    relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

    degree_sum = 0
    for relevant_node in relevant_nodes:
        degree_sum += self.graph.degree(relevant_node)

    if float(len(relevant_nodes)) != 0:
        return float(degree_sum)/float(len(relevant_nodes))
    else:
        return 0

def eccentricity(self, node):
    print('Calculating eccentricity for node', node)
    if not hasattr(self, 'all_eccentricities'):
        l = gt.label_largest_component(self.graph_gt['graph_gt'], directed=None)  # find the largest component
        print('Found the largest component')
        # print("Printing labeled largest component", l.a)
        u = gt.GraphView(self.graph_gt['graph_gt'], vfilt=l)  # extract the largest component as a graph
        print('The number of vertices in the largest component is', u.num_vertices())
        print('The number of vertices in the original graph is', nx.number_of_nodes(self.graph))
        # if nx.is_connected(self.graph) == True:
        if u.num_vertices() == nx.number_of_nodes(self.graph):
            print("Graph is connected")
            self.all_eccentricities = nx.eccentricity(self.graph)
            print("Calculated all eccentricities")
            # print("Eccentricities are", self.all_eccentricities)
            return self.all_eccentricities[node]
        else:
            # return 0
            print("Graph is disconnected")
            self.all_eccentricities = {}

    if self.all_eccentricities != {}:
        print("Returning eccentricity for", node, "-", self.all_eccentricities[node])
        return self.all_eccentricities[node]
    else:
        print("Returning 0")
        return 0

def betweenness_centrality(self, node):
    print('Calculating betweenness_centrality for node', node)
    if not hasattr(self, 'all_betweenness_centralities'):
        self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
    return self.all_betweenness_centralities[node]


def betweenness_centrality_gt(self, node):
    print('Calculating betweenness_centrality with graph_tool for node', node)
    # print('Self is',self.graph_gt['graph_gt'])
    # print('Self is also',self.graph_gt['graph_gt_labels'])
    # def convert_graph(g):
    #     converts a networkX graph to graph_tool
    #     important: NetworkX node indexes start with 1, whereas graph_tool node indexes start with 0
    #     adj = nx.adjacency_matrix(g)
    #     j = gt.Graph(directed=False)
    #     j.add_vertex(len(adj))
    #     num_vertices = adj.shape[0]
    #     for i in range(num_vertices - 1):
    #         for l in range(i + 1, num_vertices):
    #             if adj[i,l] != 0:
    #                 j.add_edge(i, l)
    #     return j

    if not hasattr(self, 'all_betweenness_centralities_gt'):
        vp, ep = gt.betweenness(self.graph_gt['graph_gt'])
        self.all_betweenness_centralities_gt = vp

    node_label = gt.find_vertex(self.graph_gt['graph_gt'], self.graph_gt['graph_gt_labels'], node)
    # print("Node",node,"has index",node_label)
    # print('Vp is',vp)
    # print('Betweenness centrality of node',node,'is',vp[self.graph_gt['graph_gt'].vertex(node_label[0])])

    return self.all_betweenness_centralities_gt[self.graph_gt['graph_gt'].vertex(node_label[0])]

def average_shortest_path_length(self, node):
    print('Calculating average_shortest_path_length for node', node)
    # caching average_shortest_path_length for all nodes at once failed
    # already switched to single calculation

    # get all shortest path lengths
    all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)

    # calculate the average
    sum_of_lengths = 0
    for target in all_shortest_path_lengths_for_node:
        sum_of_lengths += all_shortest_path_lengths_for_node[target]

    return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)


#############
# advanced metrics
#############
def correct_clustering_coefficient(self, node):
    print('Calculating correct_clustering_coefficient for node', node)
    clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node), 'clustering_coefficient'))
    degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
    corrected_cc = clustering_coefficient + (degree * clustering_coefficient) / float(4)
    return corrected_cc

def correct_average_neighbor_degree(self, node):
    print('Calculating correct_average_neighbor_degree for node', node)
    avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))

    neighbors = self.graph.neighbors(node)
    number_of_neighbors = float(len(neighbors))
    neighbor_degrees = []
    for neighbor in neighbors:
        neighbor_degrees.append(self.graph.degree(neighbor))

    # using the numpy median and standard deviation implementations
    numpy_neighbor_degrees = np.array(neighbor_degrees)
    median = np.median(numpy_neighbor_degrees)
    standard_deviation = np.std(numpy_neighbor_degrees)

    if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
        return avgnd
    else:
        return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_neighbors ) * avgnd


def correct_iterated_average_neighbor_degree(self, node):
    print('Calculating correct_iterated_average_neighbor_degree for node', node)
    avgnd = float(self.redis.hget(self.node_prefix+str(node), 'iterated_average_neighbor_degree'))

    first_level_neighbors = self.graph.neighbors(node)
    second_level_neighbors = []

    # get all two-hop nodes
    for first_level_neighbor in first_level_neighbors:
        current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
        second_level_neighbors.extend(current_second_level_neighbors)

    # remove one-hop neighbors and the node itself
    relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

    number_of_nodes = len(relevant_nodes)
    node_degrees = []
    for rel_node in relevant_nodes:
        node_degrees.append(self.graph.degree(rel_node))

    numpy_node_degrees = np.array(node_degrees)
    median = np.median(numpy_node_degrees)
    standard_deviation = np.std(numpy_node_degrees)

    if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
        return avgnd
    else:
        return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_nodes ) * avgnd
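
As a quick sanity check of the formula in correct_clustering_coefficient (numbers invented for illustration): a raw clustering coefficient of 0.5 combined with degree 4 yields 0.5 + (4 * 0.5) / 4 = 1.0, while the same raw value with degree 2 only yields 0.75, so the correction favours nodes that keep a high coefficient despite having more neighbors.

# worked example with invented values, not part of this commit
clustering_coefficient = 0.5
degree = 4.0
corrected_cc = clustering_coefficient + (degree * clustering_coefficient) / float(4)
print(corrected_cc)   # 1.0; with degree = 2.0 the result would be 0.75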
File normalizations.py added (mode: 100644) (index 0000000..a959a8c)

#normalizations.py

# min-max normalization
def min_max(self, metric_name):
    # perform min-max normalization of the specified metric for all nodes

    # get min and max from redis
    x_min = self.redis.zrange(self.metric_prefix+metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
    x_max = self.redis.zrange(self.metric_prefix+metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

    #print x_min
    #print x_max

    for node in self.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
            x_normalized = (x - x_min) / (x_max - x_min)

        # store the value for the node and metric
        self.redis.zadd(self.metric_prefix+metric_name+self.normalization_suffix, x_normalized, str(node))
        self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)

# max-min normalization (inverted: high absolute values map to low normalized values)
def max_min(self, metric_name):
    x_min = self.redis.zrange(self.metric_prefix+metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
    x_max = self.redis.zrange(self.metric_prefix+metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

    for node in self.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
            x_normalized = (x_max - x) / (x_max - x_min)

        # store the value for the node and metric
        self.redis.zadd(self.metric_prefix+metric_name+self.normalization_suffix, x_normalized, str(node))
        self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)
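
A small worked example of the two normalizations above, with invented metric values 2, 4 and 10: min_max maps them to 0.0, 0.25 and 1.0, while max_min inverts the ordering and yields 1.0, 0.75 and 0.0, which is why max_min is configured for metrics where smaller is better, such as average_shortest_path_length.

# worked example with invented values, not part of this commit
values = [2.0, 4.0, 10.0]
x_min, x_max = min(values), max(values)
print([(x - x_min) / (x_max - x_min) for x in values])   # min_max: [0.0, 0.25, 1.0]
print([(x_max - x) / (x_max - x_min) for x in values])   # max_min: [1.0, 0.75, 0.0]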
File pearson.py added (mode: 100644) (index 0000000..7a6cc1c)

import redis as rd
import numpy as np
from scipy.stats import pearsonr

metrics = ['clustering_coefficient',
           'degree',
           'average_neighbor_degree',
           'iterated_average_neighbor_degree',
           'betweenness_centrality',
           'eccentricity',
           'average_shortest_path_length',
           'corrected_clustering_coefficient',
           'corrected_average_neighbor_degree',
           'corrected_iterated_average_neighbor_degree']

rdb = rd.StrictRedis(host='localhost', port=6379, db=0)


correlations = {}
for metric1 in metrics:
    correlations[metric1] = {}
    for metric2 in metrics:
        correlations[metric1][metric2] = (0, 0)
        if metric1 == metric2:
            correlations[metric1][metric2] = (1, 0)
            continue

        dict_metric1 = dict(rdb.zrange(metric1, 0, -1, withscores=True, score_cast_func=float))
        dict_metric2 = dict(rdb.zrange(metric2, 0, -1, withscores=True, score_cast_func=float))

        values_metric1 = []
        values_metric2 = []

        for key in sorted(dict_metric1.iterkeys()):
            values_metric1.append(dict_metric1[key])

        for key in sorted(dict_metric2.iterkeys()):
            values_metric2.append(dict_metric2[key])

        correlations[metric1][metric2] = pearsonr(values_metric1, values_metric2)

for source in correlations:
    for target in correlations[source]:
        rdb.hset("correlations:"+source+":"+target, "correlation", correlations[source][target][0])
        rdb.hset("correlations:"+source+":"+target, "confidence", correlations[source][target][1])
File start.py added (mode: 100644) (index 0000000..26df05e)

#!/usr/bin/env python
import datetime
import argparse
import cProfile, pstats, StringIO
from file_importer import FileImporter
from metric_calculator import MetricCalculator
import datetime as dt

print 'Starting metric calculation', dt.datetime.now()
parser = argparse.ArgumentParser(description='Read a tab-separated graph data file and start the calculation of metrics and statistics as configured in config.py')

parser.add_argument('filename', metavar='filename', type=str,
                    help='the name of the data file containing tab-separated node ids')

parser.add_argument('--profiling', dest='profiling', action='store_true', help='enable runtime profiling into a timestamped profiling output file')

args = parser.parse_args()

if args.profiling:
    pr = cProfile.Profile()
    s = StringIO.StringIO()
    timestamp = str(datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
    outfile = open('profiling_output_'+timestamp+'.txt', 'w')
    pr.enable()

fi = FileImporter(args.filename)
graph = fi.read()
#print('This should be a Network X graph',graph)
print('Network X graph has the following number of nodes', graph.number_of_nodes())
print('Network X graph has the following number of edges', graph.number_of_edges())
graph_gt = fi.read_gt()
print('Graph tool graph has the following number of nodes', graph_gt['graph_gt'].num_vertices())
print('Graph tool graph has the following number of edges', graph_gt['graph_gt'].num_edges())
#print('Gt graph has the following properties')
mc = MetricCalculator(graph, graph_gt)
mc.start()

if args.profiling:
    ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
    ps.print_stats()
    outfile.write(s.getvalue())

print 'Ending metric calculation', dt.datetime.now()
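
Assuming the edge list lives in a tab-separated file such as example_edges.txt (a hypothetical name), the whole pipeline would be started with

    python start.py example_edges.txt

and adding the --profiling flag additionally writes a profiling_output_<timestamp>.txt report to the current directory.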
File statistics.py added (mode: 100644) (index 0000000..fb03eaa)

#statistics.py
import redis as rd
import numpy as np
from scipy.stats import pearsonr

def calculate_statistics(self, metric, redis_key):
    all_values = dict(self.redis.zrange(redis_key, 0, -1, withscores=True, score_cast_func=float)).values()
    min_value = np.min(all_values)
    max_value = np.max(all_values)

    average = np.average(all_values)
    median = np.median(all_values)
    standard_deviation = np.std(all_values)

    self.redis.hset(self.statistics_prefix+metric, 'min', min_value)
    self.redis.hset(self.statistics_prefix+metric, 'max', max_value)
    self.redis.hset(self.statistics_prefix+metric, 'average', average)
    self.redis.hset(self.statistics_prefix+metric, 'median', median)
    self.redis.hset(self.statistics_prefix+metric, 'standard_deviation', standard_deviation)


def calculate_correlations(self):
    m = self.base_metrics.keys()
    c = self.advanced_metrics.keys()

    metrics = m + c

    correlations = {}
    for metric1 in metrics:
        correlations[metric1] = {}
        for metric2 in metrics:
            correlations[metric1][metric2] = (0, 0)
            if metric1 == metric2:
                correlations[metric1][metric2] = (1, 0)
                continue

            dict_metric1 = dict(self.redis.zrange(self.metric_prefix+metric1, 0, -1, withscores=True, score_cast_func=float))
            dict_metric2 = dict(self.redis.zrange(self.metric_prefix+metric2, 0, -1, withscores=True, score_cast_func=float))
            values_metric1 = []
            values_metric2 = []

            for key in sorted(dict_metric1.iterkeys()):
                values_metric1.append(dict_metric1[key])

            for key in sorted(dict_metric2.iterkeys()):
                values_metric2.append(dict_metric2[key])

            correlations[metric1][metric2] = pearsonr(values_metric1, values_metric2)

            values_metric1 = []
            values_metric2 = []

    for source in correlations:
        for target in correlations[source]:
            self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "correlation", correlations[source][target][0])
            self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "confidence", correlations[source][target][1])
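
The summary statistics end up in a hash under statistics_prefix + metric with the fields min, max, average, median and standard_deviation; the pairwise results of pearsonr are stored under statistics_prefix + 'correlations:' + metric1 + ':' + metric2, where the field correlation holds the coefficient and confidence holds the accompanying p-value. A short read-back sketch with hypothetical keys, not part of this commit:

# hypothetical read-back example, not part of this commit
import redis as rd
import config

r = rd.StrictRedis(host='localhost', port=6379, db=0)
print(r.hgetall(config.statistics_prefix + 'degree'))
print(r.hgetall(config.statistics_prefix + 'correlations:degree:clustering_coefficient'))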