Subject SHA-1 Author Date
Fix redis DB configuration 5ccfec34b4bcb3323071ea5667da2a375e219a3e Vasilis Ververis 2017-04-10 19:42:42
Fix redis DB in config dffd5d2deb0c5d8043ff43fc3fe6cb1efd3bc990 Vasilis Ververis 2017-04-10 10:23:15
Set proper host address to redis, remove extra spaces, set ff to unix dd87734e46d0756f7d7f48b5de5591ffd487c8ab Vasilis Ververis 2017-04-04 22:48:03
Add redis conf, datasets, gitignore, convert files 34fd00db0525b875e2f9afbe5a10af28fe06b03b Vasilis Ververis 2017-03-16 09:52:30
Commig cdebeb923331f9081529a023c00cb1f0543e3d55 Mathias Ehlert 2014-12-07 16:07:08
Commit cdebeb923331f9081529a023c00cb1f0543e3d55 - Commig
Author: Mathias Ehlert
Author date (UTC): 2014-12-07 16:07
Committer: Mathias Ehlert
Commit date (UTC): 2014-12-07 16:07
Tree: e7ad3f3e3de345cdeb26e5e821602ffb64533857
File Lines added Lines deleted
README.md 7 0
__init__.py 0 0
advancedscores.py 33 0
advancedscores.pyc 0 0
config.py 84 0
config.pyc 0 0
file_importer.py 117 0
file_importer.pyc 0 0
graph tool test.py 99 0
indexing.py 22 0
indexing.pyc 0 0
log 644344 0
metric_calculator.py 179 0
metric_calculator.pyc 0 0
metrics.py 195 0
metrics.pyc 0 0
normalizations.py 37 0
normalizations.pyc 0 0
pearson.py 45 0
start.py 43 0
statistics.py 56 0
statistics.pyc 0 0

File README.md added (mode: 100644) (index 0000000..924a1df)
1 coria-backend
2 =============
3
4 Connectivity Risk Analysis Python Backend
5
6 usage: start.py [-h] [--profiling] filename
7

File __init__.py added (mode: 100644) (index 0000000..e69de29)

File advancedscores.py added (mode: 100644) (index 0000000..489636a)
1 # advancedscores.py
2 import numpy as np
3
4 ################
5 #advanced scores
6 ################
7
def adv_unified_risk_score(self):
    """Compute and store the advanced unified risk score of every node.

    Reads the normalized corrected clustering coefficient and the unified
    risk score (URS) of all nodes from Redis. For a node whose URS lies at or
    beyond the 10th/90th percentile of all URS values and whose normalized
    corrected clustering coefficient is at least 0.25, the advanced score is
    the 3:1 weighted mean of URS and clustering coefficient; for every other
    node it equals the URS. The result is written to the node's hash and to
    the sorted set of the score.
    """
    # caching of all values in dictionaries
    cc_key = (self.metric_prefix + 'corrected_clustering_coefficient'
              + self.normalization_suffix)
    all_ccs_normalized = dict(self.redis.zrange(
        cc_key, 0, -1, withscores=True, score_cast_func=float))
    all_urs = dict(self.redis.zrange(
        self.score_prefix + 'unified_risk_score', 0, -1,
        withscores=True, score_cast_func=float))

    # numpy needs a real sequence: on Python 3 dict.values() is only a view
    urs_values = list(all_urs.values())
    urs_percentile_10 = np.percentile(urs_values, 10)
    urs_percentile_90 = np.percentile(urs_values, 90)

    for node in self.nodes:
        node_id = str(node)
        cc_normalized = all_ccs_normalized[node_id]
        urs = all_urs[node_id]

        in_outer_deciles = (urs >= urs_percentile_90
                            or urs <= urs_percentile_10)
        if in_outer_deciles and cc_normalized >= 0.25:
            advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
        else:
            advanced_unified_risk_score = urs

        # save for node
        self.redis.hset(self.node_prefix + node_id,
                        'advanced_unified_risk_score',
                        advanced_unified_risk_score)
        # save for score
        self.redis.zadd(self.score_prefix + 'advanced_unified_risk_score',
                        advanced_unified_risk_score, node_id)

File advancedscores.pyc added (mode: 100644) (index 0000000..ce98f24)

File config.py added (mode: 100644) (index 0000000..8410e0b)
1 #config.py
2 import metrics
3 import normalizations
4 import advancedscores
5
# Redis keys of the index sets
node_index_key = 'all_nodes'
metric_index_key = 'all_metrics'
score_index_key = 'all_scores'

# Redis key prefixes
node_neighbors_prefix = 'node_neighbors:'
node_prefix = 'node_metrics:'
metric_prefix = 'metric:'
score_prefix = 'score:'
statistics_prefix = 'statistics:'

# suffix appended to a metric name to address its normalized values
normalization_suffix = '_normalized'

# Base metrics: absolute values are calculated for each node in the first step.
# Key: name of the metric. Value: the implementing method.
# Interface: each method takes the node as its single parameter, performs the
# necessary calculation and returns a float with the value for that node.
base_metrics = {
    'clustering_coefficient': metrics.clustering_coefficient,
    'degree': metrics.degree,
    'average_neighbor_degree': metrics.average_neighbor_degree,
    'iterated_average_neighbor_degree': metrics.iterated_average_neighbor_degree,
    # disabled: 'betweenness_centrality': metrics.betweenness_centrality,
    'betweenness_centrality_gt': metrics.betweenness_centrality_gt,
    # disabled: 'eccentricity': metrics.eccentricity,
    'average_shortest_path_length': metrics.average_shortest_path_length,
}

# Advanced metrics: corrections or post-processing steps that rely on the
# values of other metrics or normalizations.
# Key: name of the metric. Value: the correction method.
advanced_metrics = {
    'corrected_clustering_coefficient': metrics.correct_clustering_coefficient,
    'corrected_average_neighbor_degree': metrics.correct_average_neighbor_degree,
    'corrected_iterated_average_neighbor_degree': metrics.correct_iterated_average_neighbor_degree,
}

# Normalization method per metric (corrected metrics are listed under their
# own names as well).
# Key: name of the metric. Value: the normalization method.
# Interface: a normalization method takes the name of the (absolute) metric as
# its single argument and needs no return value. It reads the data it needs
# from the Redis instance, loops over all nodes, calculates the normalized
# value of the metric for each node and saves it to Redis under
# "<metric_name>_normalized"; the result is stored inside the node's hash for
# metrics.
normalization_methods = {
    'clustering_coefficient': normalizations.min_max,
    'corrected_clustering_coefficient': normalizations.min_max,
    'degree': normalizations.min_max,
    'average_neighbor_degree': normalizations.min_max,
    'corrected_average_neighbor_degree': normalizations.min_max,
    'iterated_average_neighbor_degree': normalizations.min_max,
    'corrected_iterated_average_neighbor_degree': normalizations.min_max,
    # disabled: 'betweenness_centrality': normalizations.min_max,
    'betweenness_centrality_gt': normalizations.min_max,
    # disabled: 'eccentricity': normalizations.max_min,
    'average_shortest_path_length': normalizations.max_min,
}

# Simple scores: a combination of normalized metric values, each with a
# weight; the weights of one score add up to 1.
# Note: the names are not methods but Redis keys.
scores = {
    'unified_risk_score': {
        'degree': 0.25,
        'corrected_average_neighbor_degree': 0.15,
        'corrected_iterated_average_neighbor_degree': 0.1,
        'betweenness_centrality_gt': 0.25,
        # disabled: 'eccentricity': 0.125,
        'average_shortest_path_length': 0.25,
    },
}

# Advanced scores: scores that need a more sophisticated algorithm.
# They are added here and implemented like the example below.
advanced_scores = {
    'advanced_unified_risk_score': advancedscores.adv_unified_risk_score,
}

File config.pyc added (mode: 100644) (index 0000000..80b9cca)

File file_importer.py added (mode: 100644) (index 0000000..68b7ae9)
1 import networkx as nx
2 import graph_tool.all as gt
3
class FileImporter(object):
    """Parse a tab-separated edge list into a NetworkX and a graph_tool graph.

    Every line of the data file names a source node and a target node as
    integers separated by a tab. Self-loops are ignored. Both graphs are
    undirected; the graph_tool graph carries a "double" vertex property that
    holds the original node id of each vertex.
    """

    def __init__(self,filename):
        """Open ``filename`` and create the two empty graphs."""
        print ('Starting file importer!')
        self.data_file = open(filename)
        self.graph = nx.Graph()
        self.graph_gt = gt.Graph(directed=False)
        self.graph_gt_labels = self.graph_gt.new_vertex_property("double")
        # node id -> graph_tool vertex; replaces a full gt.find_vertex scan
        # over all vertices for every parsed edge
        self._gt_vertex_by_node = {}

    def read(self):
        """Parse the whole data file and return the NetworkX graph.

        The file is closed once it has been consumed; calling ``read`` again
        simply returns the already built graph.
        """
        if not self.data_file.closed:
            try:
                for line in self.data_file:
                    print("Parsing line",line)
                    self.parse_line(line)
            finally:
                self.data_file.close()
        return self.graph

    def read_gt(self):
        """Return the graph_tool graph and its node-id property map as a dict."""
        return {'graph_gt':self.graph_gt, 'graph_gt_labels':self.graph_gt_labels}

    def parse_line(self, line):
        """Add the edge described by one tab-separated line to both graphs.

        The first field is the source node, the second field the target node.
        Blank lines and self-loops are skipped.
        """
        stripped = line.strip()
        if not stripped:
            # e.g. a trailing empty line at the end of the data file
            return

        fields = stripped.split("\t")
        from_node = int(fields[0])
        to_node = int(fields[1])

        if from_node == to_node:
            return

        # add edge to the networkx graph
        self.graph.add_edge(from_node, to_node)

        # add edge to the graph_tool graph, creating missing vertices first
        source, source_is_new = self._get_or_add_gt_vertex(from_node)
        target, target_is_new = self._get_or_add_gt_vertex(to_node)

        # an edge touching a freshly created vertex cannot exist yet; only
        # two known vertices need the duplicate check
        if (source_is_new or target_is_new
                or self.graph_gt.edge(source, target) is None):
            self.graph_gt.add_edge(source, target)

    def _get_or_add_gt_vertex(self, node):
        """Return ``(vertex, created)`` for the graph_tool vertex of ``node``."""
        vertex = self._gt_vertex_by_node.get(node)
        if vertex is not None:
            return vertex, False
        vertex = self.graph_gt.add_vertex()
        self.graph_gt_labels[vertex] = node
        self._gt_vertex_by_node[node] = vertex
        return vertex, True

File file_importer.pyc added (mode: 100644) (index 0000000..0d69976)

File graph tool test.py added (mode: 100644) (index 0000000..b09bec7)
1 import graph_tool.all as gt
2 import networkx as nx
3 import matplotlib as mp
4 import matplotlib.pyplot as plt
5
# small NetworkX graph used for the comparison further down
g = nx.Graph()
g.add_edge(1,2)
g.add_edge(2,3)
g.add_edge(1,8)
print(g.nodes())
print(g.edges())
adj = nx.adjacency_matrix(g)
print(adj)

# graph_tool graph with a string label per vertex
test = gt.Graph(directed=False)
test_name = test.new_vertex_property("string")
c1=test.add_vertex()
test_name[test.vertex(c1)] = "A"
c2=test.add_vertex()
test_name[test.vertex(c2)] = "B"
c=test.add_vertex()
test_name[test.vertex(c)] = "C"
test.add_edge(c1, c2)
gt.graph_draw(test)

# look a vertex up by its label
found = gt.find_vertex(test,test_name,"C")
print(found)
if found != []:
    print("Index of C is")
    print(found[0])
    if (int(found[0])==1):
        print("True")
    else :
        print("False")

# single-argument print(...) behaves the same on Python 2 and Python 3
for v in test.vertices():
    print(v)
    print(test.vertex(v))
    print(test_name[v])
41 # print(vp[test.vertex(v)])
42
43
def convert_graph(g):
    """Convert the NetworkX graph ``g`` into an undirected graph_tool graph.

    Vertex ``i`` of the result corresponds to row/column ``i`` of
    ``nx.adjacency_matrix(g)``, so graph_tool indexes start at 0 regardless
    of the node ids used in ``g``. Only the upper triangle of the matrix is
    read, i.e. every undirected edge is added once and self-loops are
    skipped.
    """
    # derive the matrix from the argument instead of a module-level global
    adjacency = nx.adjacency_matrix(g)
    num_vertices = adjacency.shape[0]
    print (num_vertices)

    j = gt.Graph(directed=False)
    if num_vertices:
        j.add_vertex(num_vertices)

    for i in range(num_vertices - 1):
        for l in range(i + 1, num_vertices):
            if adjacency[i,l] != 0:
                j.add_edge(i, l)
    return j
57
# convert the NetworkX graph and list the resulting vertices and edges
j = convert_graph(g)

for v in j.vertices():
    print(v)
for e in j.edges():
    print(e)

# betweenness centrality from both libraries for a side-by-side comparison
bg = nx.betweenness_centrality(g)
vp,ep = gt.betweenness(j)

print(bg)

# single-argument print(...) behaves the same on Python 2 and Python 3
for v in j.vertices():
    print(v)
    print(vp[j.vertex(v)])
77
78 #nx.draw(g)
79 #plt.draw()
80 #plt.show()
81 #gt.graph_draw(j)
82
83
84 #g = gt.collection.data["polblogs"]
85 #v1 = g.add_vertex()
86 #v2 = g.add_vertex()
87 #v3 = g.add_vertex()
88 #e = g.add_edge(v2, v1)
89 #f = g.add_edge(v3, v1)
90 #print(v1.out_degree())
91 #vp,ep = gt.betweenness(g)
92 #print(vp[g.vertex(1)])
93 #print(vp[g.vertex(2)])
94 #print(vp)
95 #print(type(vp))
96 #gt.graph_draw(g)
97 #print(vp[1],ep[1])
98 #gt.graph_draw(g.vp)
99 #gt.graph_draw(g, vertex_text=g.vertex_index, vertex_font_size=18,output_size=(200, 200), output="two-nodes.png")

File indexing.py added (mode: 100644) (index 0000000..359eb71)
1 #indexing
def index_nodes(self):
    """Add every node of the graph to the Redis set ``self.node_index_key``."""
    nodes = list(self.nodes)
    # sadd is variadic: the members have to be unpacked, otherwise the whole
    # list is stored as one single member. It must not be called without
    # members, hence the guard for an empty graph.
    if nodes:
        self.redis.sadd(self.node_index_key, *nodes)
4
def index_neighbors(self):
    """Store the neighbors of every node in a Redis set.

    The set of a node is keyed by ``self.node_neighbors_prefix`` followed by
    the node id.
    """
    for node in self.nodes:
        # neighbors() may return an iterator; materialise it before unpacking
        node_neighbors = list(self.graph.neighbors(int(node)))
        # unpack so that each neighbor becomes its own set member; sadd must
        # not be called without members
        if node_neighbors:
            self.redis.sadd(self.node_neighbors_prefix+str(node), *node_neighbors)
9
def index_metrics(self):
    """Add the names of all base and advanced metrics to the metric index set."""
    metric_names = list(self.base_metrics) + list(self.advanced_metrics)
    # one variadic sadd instead of one round trip per name; sadd must not be
    # called without members
    if metric_names:
        self.redis.sadd(self.metric_index_key, *metric_names)
16
def index_scores(self):
    """Add the names of all scores and advanced scores to the score index set."""
    score_names = list(self.scores) + list(self.advanced_scores)
    # one variadic sadd instead of one round trip per name; sadd must not be
    # called without members
    if score_names:
        self.redis.sadd(self.score_index_key, *score_names)

File indexing.pyc added (mode: 100644) (index 0000000..fca8491)

The diff for file log is too big (644344 changes) and cannot be shown.

File metric_calculator.py added (mode: 100644) (index 0000000..c2cc665)
1 import networkx as nx
2 import graph_tool.all as gt
3 import redis as rd
4 import numpy as np
5 import indexing
6 import statistics
7 import normalizations
8 import config
9 import datetime as dt
10
11
class MetricCalculator(object):
    """Calculate metrics, normalizations, scores and statistics for a graph.

    All results are written to Redis. Which metrics, normalizations and
    scores are processed is taken from the ``config`` module.
    """

    def __init__ (self, graph,graph_gt, redis_host='localhost', redis_port=6379, redis_db=0):
        """Store the graphs, connect to Redis and copy the configuration.

        graph      -- the NetworkX graph to work on
        graph_gt   -- dict holding the graph_tool graph ('graph_gt') and its
                      node-id property map ('graph_gt_labels')
        redis_host, redis_port, redis_db
                   -- Redis connection settings; the defaults are the values
                      that used to be hard-coded
        """
        print ('Starting metric_calculator!')
        self.graph = graph
        self.graph_gt = graph_gt
        self.redis = rd.StrictRedis(host=redis_host, port=redis_port, db=redis_db)
        self.nodes = nx.nodes(graph)

        # configuration variables are read from the config file and are also
        # saved to instance variables for easy access
        self.node_index_key = config.node_index_key
        self.metric_index_key = config.metric_index_key
        self.score_index_key = config.score_index_key

        self.node_neighbors_prefix = config.node_neighbors_prefix
        self.node_prefix = config.node_prefix
        self.metric_prefix = config.metric_prefix
        self.score_prefix = config.score_prefix
        self.statistics_prefix = config.statistics_prefix

        self.normalization_suffix = config.normalization_suffix

        self.base_metrics = config.base_metrics
        self.advanced_metrics = config.advanced_metrics

        self.normalization_methods = config.normalization_methods

        self.scores = config.scores
        self.advanced_scores = config.advanced_scores

    def start(self):
        """Run the complete pipeline. Flushes the current Redis database first."""
        # clean all data in Redis
        self.redis.flushdb()

        # index creation
        self.create_indexes()

        # main calculations
        self.calculate_metrics()
        self.calculate_advanced_metrics()
        self.normalize_metrics()
        self.calculate_scores()
        self.calculate_advanced_scores()

        # statistics
        self.calculate_statistics()

    ##################
    #### INDEXING ####
    ##################
    def create_indexes(self):
        """Create the node, neighbor, metric and score indexes (see indexing.py)."""
        indexing.index_nodes(self)
        indexing.index_neighbors(self)
        indexing.index_metrics(self)
        indexing.index_scores(self)

    ###########################
    #### CALCULATION LOOPS ####
    ###########################

    def _store_metric_values(self, metric_methods):
        """Call every method of ``metric_methods`` for every node and store the value.

        ``metric_methods`` maps a metric name to a callable taking the
        calculator and the node. The float result is written to the node's
        hash and to the sorted set of the metric.
        """
        for metric_name in metric_methods:
            metric_method = metric_methods[metric_name]

            for node in self.nodes:
                node = int(node)
                value = float(metric_method(self,node))

                # store result in node values
                self.redis.hset(self.node_prefix+str(node), metric_name, value)

                # also store result to metric set
                self.redis.zadd(self.metric_prefix+metric_name, value, str(node))

    def calculate_metrics(self):
        """Calculate all configured base metrics for every node."""
        print ('Starting calculate_metrics')
        self._store_metric_values(self.base_metrics)

    def calculate_advanced_metrics(self):
        """Calculate all configured advanced metrics for every node."""
        print ('Starting calculate_advanced_metrics')
        self._store_metric_values(self.advanced_metrics)

    def normalize_metrics(self):
        """Normalize every base and advanced metric.

        The method listed for the metric in ``normalization_methods`` is
        used; metrics not listed there fall back to min-max normalization.
        """
        print ('Starting normalize_metrics')
        # merge without list concatenation of items(), which fails on Python 3
        all_metrics = dict(self.base_metrics)
        all_metrics.update(self.advanced_metrics)

        for metric_name in all_metrics:
            if metric_name in self.normalization_methods:
                normalization_method = self.normalization_methods[metric_name]
            else:
                # fallback normalization is min-max
                normalization_method = normalizations.min_max
            normalization_method(self,metric_name)

    def calculate_scores(self):
        """Calculate every configured score as a weighted sum of normalized metrics."""
        print ('Starting calculate_scores')
        for score_name in self.scores:
            metrics_with_weights = self.scores[score_name]

            for node in self.nodes:
                node_key = self.node_prefix+str(node)
                score_value = 0.0

                # get normalized values
                for metric in metrics_with_weights:
                    weight = metrics_with_weights[metric]
                    value = float(self.redis.hget(node_key,metric+self.normalization_suffix))
                    score_value += weight * value

                self.redis.hset(node_key,score_name, score_value)
                self.redis.zadd(self.score_prefix+score_name, score_value, str(node))

    def calculate_advanced_scores(self):
        """Call every configured advanced score method with this calculator."""
        print ('Starting calculate_advanced_scores')
        for advanced_score in self.advanced_scores:
            self.advanced_scores[advanced_score](self)

    #############
    # statistics
    #############

    def calculate_statistics(self):
        """Calculate statistics for all metrics and scores, then the correlations.

        Metrics are covered with their absolute and their normalized values.
        """
        print ('Starting calculate_statistics')
        for metric in self.base_metrics:
            # absolute and normalized
            statistics.calculate_statistics(self, metric, self.metric_prefix+metric)
            statistics.calculate_statistics(self, metric+self.normalization_suffix, self.metric_prefix+metric+self.normalization_suffix)

        for advanced_metric in self.advanced_metrics:
            # absolute and normalized
            statistics.calculate_statistics(self, advanced_metric, self.metric_prefix+advanced_metric)
            statistics.calculate_statistics(self, advanced_metric+self.normalization_suffix, self.metric_prefix+advanced_metric+self.normalization_suffix)

        for score in self.scores:
            statistics.calculate_statistics(self, score, self.score_prefix+score)

        for advanced_score in self.advanced_scores:
            statistics.calculate_statistics(self, advanced_score, self.score_prefix+advanced_score)

        statistics.calculate_correlations(self)
179

File metric_calculator.pyc added (mode: 100644) (index 0000000..4854fd8)

File metrics.py added (mode: 100644) (index 0000000..6673fc8)
1 #metrics.py
2 import networkx as nx
3 import numpy as np
4 import datetime as dt
5 import graph_tool.all as gt
6
def clustering_coefficient(self,node):
    """Return the clustering coefficient of ``node``.

    On the first call the coefficients of all nodes are calculated at once
    and kept on the instance as ``all_clustering_coefficients``; later calls
    only read from that cache.
    NOTE: this should result in a performance gain, but for very large graphs
    this might be a problem. In that case just returning
    nx.clustering(self.graph, node) might be better.
    """
    print ('Calculating clustering_coefficient for node',node)
    try:
        coefficients = self.all_clustering_coefficients
    except AttributeError:
        # first run: fill the cache
        coefficients = nx.clustering(self.graph)
        self.all_clustering_coefficients = coefficients

    return coefficients[node]
17
def degree(self, node):
    """Return the degree of ``node`` in the NetworkX graph."""
    print('Calculating degree for node', node)
    graph = self.graph
    return graph.degree(node)
21
22
def average_neighbor_degree(self,node):
    """Return the average neighbor degree of ``node``.

    Uses the same caching technique as clustering_coefficient: the values of
    all nodes are calculated on the first call and kept on the instance as
    ``all_average_neighbor_degrees``. This might also break for very large
    graphs; nx.average_neighbor_degree(self.graph, nodes=node) might be the
    way to go then.
    """
    print('Calculating average_neighbour_degree for node',node)
    try:
        degrees = self.all_average_neighbor_degrees
    except AttributeError:
        # first run: fill the cache
        degrees = nx.average_neighbor_degree(self.graph)
        self.all_average_neighbor_degrees = degrees
    return degrees[node]
32
def iterated_average_neighbor_degree(self, node):
    """Return the average degree of the nodes exactly two hops away from ``node``.

    Two-hop nodes are the neighbors of the node's neighbors without the
    neighbors themselves and without the node. Returns 0 when there are none.
    """
    print('Calculating iterated_average_neighbor degree for node',node)
    # materialise: neighbors() may return an iterator, which would already be
    # exhausted when the one-hop nodes are subtracted below
    first_level_neighbors = list(self.graph.neighbors(node))

    # get all two-hop nodes
    second_level_neighbors = []
    for first_level_neighbor in first_level_neighbors:
        second_level_neighbors.extend(self.graph.neighbors(first_level_neighbor))

    # remove one-hop nodes and self
    relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

    if not relevant_nodes:
        return 0

    degree_sum = 0
    for relevant_node in relevant_nodes:
        degree_sum += self.graph.degree(relevant_node)

    return float(degree_sum)/float(len(relevant_nodes))
55
def eccentricity(self, node):
    """Return the eccentricity of ``node``, or 0 when the graph is disconnected.

    On the first call the largest component of the graph_tool graph is
    compared with the NetworkX graph: when both have the same number of
    vertices the graph is treated as connected and the eccentricities of all
    nodes are calculated and cached as ``all_eccentricities``; otherwise an
    empty cache is stored and every call returns 0.
    """
    print('Calculating eccentricity for node', node)
    if not hasattr(self, 'all_eccentricities'):
        gt_graph = self.graph_gt['graph_gt']
        # find the largest component and extract it as a graph
        component_labels = gt.label_largest_component(gt_graph,directed = None)
        print ('Found the largest component')
        largest_component = gt.GraphView(gt_graph, vfilt=component_labels)
        print('The number of vertices in the largest component is',largest_component.num_vertices())
        print('The number of vertices in the original graph is', nx.number_of_nodes(self.graph))
        if largest_component.num_vertices() != nx.number_of_nodes(self.graph):
            print("Graph is disconnected")
            self.all_eccentricities = {}
        else:
            print ("Graph is connected")
            self.all_eccentricities = nx.eccentricity(self.graph)
            print ("Calculated all eccentricities")
            return self.all_eccentricities[node]

    # an empty cache marks a disconnected graph
    if self.all_eccentricities == {}:
        print("Returning 0")
        return 0

    print("Returning eccentricity for",node,"-",self.all_eccentricities[node])
    return self.all_eccentricities[node]
82
def betweenness_centrality(self, node):
    """Return the NetworkX betweenness centrality of ``node``.

    The centralities of all nodes are calculated on the first call and kept
    on the instance as ``all_betweenness_centralities``.
    """
    print('Calculating betweenness_centrality for node',node)
    try:
        centralities = self.all_betweenness_centralities
    except AttributeError:
        # first run: fill the cache
        centralities = nx.betweenness_centrality(self.graph)
        self.all_betweenness_centralities = centralities
    return centralities[node]
88
89
def betweenness_centrality_gt(self, node):
    """Return the betweenness centrality of ``node`` calculated with graph_tool.

    ``self.graph_gt`` is a dict holding the graph_tool graph ('graph_gt') and
    the vertex property with the node ids ('graph_gt_labels'). The vertex
    betweenness of the whole graph is calculated on the first call and kept
    on the instance as ``all_betweenness_centralities_gt``. The vertex that
    belongs to ``node`` is looked up through the label property.
    """
    print('Calculating betweenness_centrality with graph_tool for node',node)
    gt_graph = self.graph_gt['graph_gt']
    gt_labels = self.graph_gt['graph_gt_labels']

    if not hasattr(self, 'all_betweenness_centralities_gt'):
        # gt.betweenness returns vertex and edge values; only the vertex
        # values are cached
        vertex_betweenness, _edge_betweenness = gt.betweenness(gt_graph)
        self.all_betweenness_centralities_gt = vertex_betweenness

    # vertices whose label equals the node id; the first match is used
    matching_vertices = gt.find_vertex(gt_graph, gt_labels, node)
    vertex = gt_graph.vertex(matching_vertices[0])

    return self.all_betweenness_centralities_gt[vertex]
118
def average_shortest_path_length(self, node):
    """Return the mean shortest path length from ``node`` to the reached nodes.

    The lengths are calculated per node on every call; caching them for all
    nodes at once failed, so the single calculation is used.
    """
    print('Calculating average_shortes_path_length for node',node)

    # shortest path lengths from the node, keyed by target
    lengths_by_target = nx.shortest_path_length(self.graph, source=node)

    # calculate average
    total_length = sum(lengths_by_target[target] for target in lengths_by_target)
    target_count = len(lengths_by_target)

    return float(total_length)/target_count
133
134
135 #############
136 # advanced metrics
137 #############
def correct_clustering_coefficient(self,node):
    """Return the clustering coefficient of ``node`` corrected by its degree.

    Both inputs are read from the node's hash in Redis. The result is
    cc + (degree * cc) / 4.
    """
    print('Calculating correct_clustering_coefficient for node',node)
    node_key = self.node_prefix+str(node)
    cc = float(self.redis.hget(node_key,'clustering_coefficient'))
    node_degree = float(self.redis.hget(node_key, 'degree'))
    return cc + (node_degree * cc) / float(4)
144
def correct_average_neighbor_degree(self,node):
    """Return the average neighbor degree of ``node`` corrected for skew.

    The stored average neighbor degree is shifted by
    ((median - average) / standard deviation) / number of neighbors, relative
    to the average itself. It is returned unchanged when it is 0, when the
    node has no neighbors or when all neighbor degrees are equal.
    """
    print('Calculating correct_average_neighbor degree for node',node)
    avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))

    # materialise: neighbors() may return an iterator, which has no len()
    neighbors = list(self.graph.neighbors(node))
    number_of_neighbors = float(len(neighbors))
    if avgnd == 0.0 or number_of_neighbors == 0.0:
        # nothing to correct; also keeps numpy away from an empty array
        return avgnd

    # using numpy median and standard deviation implementation
    numpy_neighbor_degrees = np.array([self.graph.degree(neighbor) for neighbor in neighbors])
    median = np.median(numpy_neighbor_degrees)
    standard_deviation = np.std(numpy_neighbor_degrees)

    if standard_deviation == 0.0:
        return avgnd
    return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_neighbors ) * avgnd
164
165
def correct_iterated_average_neighbor_degree(self, node):
    """Return the iterated average neighbor degree of ``node`` corrected for skew.

    Works on the nodes exactly two hops away (neighbors of neighbors without
    the neighbors themselves and without the node). The stored value is
    shifted by ((median - average) / standard deviation) / number of two-hop
    nodes, relative to the average itself. It is returned unchanged when it
    is 0, when there are no two-hop nodes or when all their degrees are equal.
    """
    print('Calculating correct_iterated_avverage_neighbour_gegree for node',node)
    avgnd = float(self.redis.hget(self.node_prefix+str(node), 'iterated_average_neighbor_degree'))

    # materialise: neighbors() may return an iterator, which would already be
    # exhausted when the one-hop nodes are subtracted below
    first_level_neighbors = list(self.graph.neighbors(node))

    # get all two-hop nodes
    second_level_neighbors = []
    for first_level_neighbor in first_level_neighbors:
        second_level_neighbors.extend(self.graph.neighbors(first_level_neighbor))

    # remove one-hop neighbors and self
    relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

    number_of_nodes = len(relevant_nodes)
    if avgnd == 0.0 or number_of_nodes == 0:
        # nothing to correct; also keeps numpy away from an empty array
        return avgnd

    numpy_node_degrees = np.array([self.graph.degree(rel_node) for rel_node in relevant_nodes])
    median = np.median(numpy_node_degrees)
    standard_deviation = np.std(numpy_node_degrees)

    if standard_deviation == 0.0:
        return avgnd
    return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_nodes ) * avgnd
194
195

File metrics.pyc added (mode: 100644) (index 0000000..0700311)

File normalizations.py added (mode: 100644) (index 0000000..a959a8c)
1 #normalizations.py
def min_max(self,metric_name):
    """Min-max normalize ``metric_name`` for all nodes.

    Minimum and maximum are read from the sorted set of the metric. Every
    node gets (x - min) / (max - min), or 1.0 when all values are equal. The
    result is stored in the sorted set and in the node's hash under the
    metric name followed by the normalization suffix.
    """
    metric_key = self.metric_prefix+metric_name

    # get min and max from redis
    lowest = self.redis.zrange(metric_key, 0, 0, withscores=True, score_cast_func=float)
    highest = self.redis.zrange(metric_key, -1, -1, withscores=True, score_cast_func=float)
    if not lowest or not highest:
        # no value stored for this metric (e.g. empty graph): nothing to do
        return

    x_min = lowest[0][1]
    x_max = highest[0][1]
    value_range = x_max - x_min
    normalized_key = metric_key+self.normalization_suffix
    normalized_field = metric_name+self.normalization_suffix

    for node in self.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
            x_normalized = (x - x_min) / value_range

        # store value for node and metric
        self.redis.zadd(normalized_key, x_normalized, str(node))
        self.redis.hset(self.node_prefix+str(node), normalized_field, x_normalized)
22
23 #max min normalization
def max_min(self,metric_name):
    """Max-min normalize ``metric_name`` for all nodes.

    Minimum and maximum are read from the sorted set of the metric. Every
    node gets (max - x) / (max - min), i.e. the smallest value maps to 1.0,
    or 1.0 when all values are equal. The result is stored in the sorted set
    and in the node's hash under the metric name followed by the
    normalization suffix.
    """
    metric_key = self.metric_prefix+metric_name

    # get min and max from redis
    lowest = self.redis.zrange(metric_key, 0, 0, withscores=True, score_cast_func=float)
    highest = self.redis.zrange(metric_key, -1, -1, withscores=True, score_cast_func=float)
    if not lowest or not highest:
        # no value stored for this metric (e.g. empty graph): nothing to do
        return

    x_min = lowest[0][1]
    x_max = highest[0][1]
    value_range = x_max - x_min
    normalized_key = metric_key+self.normalization_suffix
    normalized_field = metric_name+self.normalization_suffix

    for node in self.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
            x_normalized = (x_max - x) / value_range

        # store value for node and metric
        self.redis.zadd(normalized_key, x_normalized, str(node))
        self.redis.hset(self.node_prefix+str(node), normalized_field, x_normalized)

File normalizations.pyc added (mode: 100644) (index 0000000..b814a04)

File pearson.py added (mode: 100644) (index 0000000..7a6cc1c)
1 import redis as rd
2 import numpy as np
3 from scipy.stats import pearsonr
4
# Pearson correlation between every pair of metrics, written to Redis as
# hashes "correlations:<metric1>:<metric2>" with the fields "correlation" and
# "confidence".

# The metric calculator stores the sorted set of a metric under
# config.metric_prefix ('metric:') followed by the metric name.
metric_prefix = 'metric:'

metrics = ['clustering_coefficient',
           'degree',
           'average_neighbor_degree',
           'iterated_average_neighbor_degree',
           'betweenness_centrality',
           'eccentricity',
           'average_shortest_path_length',
           'corrected_clustering_coefficient',
           'corrected_average_neighbor_degree',
           'corrected_iterated_average_neighbor_degree']

rdb = rd.StrictRedis(host='localhost', port=6379, db=0)

# fetch the node -> value mapping of every metric once instead of once per pair
values_by_metric = {}
for metric in metrics:
    values_by_metric[metric] = dict(rdb.zrange(metric_prefix+metric, 0, -1, withscores=True, score_cast_func=float))

correlations = {}
for metric1 in metrics:
    correlations[metric1] = {}
    for metric2 in metrics:
        if metric1 == metric2:
            correlations[metric1][metric2] = (1,0)
            continue

        dict_metric1 = values_by_metric[metric1]
        dict_metric2 = values_by_metric[metric2]

        # pair the values up by node so that both lists are aligned and have
        # the same length
        common_nodes = sorted(set(dict_metric1) & set(dict_metric2))
        if len(common_nodes) < 2:
            # metric not calculated (or too little data): keep the default
            correlations[metric1][metric2] = (0,0)
            continue

        values_metric1 = [dict_metric1[key] for key in common_nodes]
        values_metric2 = [dict_metric2[key] for key in common_nodes]

        correlations[metric1][metric2] = pearsonr(values_metric1,values_metric2)

for source in correlations:
    for target in correlations[source]:
        rdb.hset("correlations:"+source+":"+target, "correlation", correlations[source][target][0])
        rdb.hset("correlations:"+source+":"+target, "confidence", correlations[source][target][1])

File start.py added (mode: 100644) (index 0000000..26df05e)
#!/usr/bin/env python
"""Entry point: import a graph data file and run the metric calculation.

usage: start.py [-h] [--profiling] filename

With --profiling the run is profiled with cProfile and the statistics,
sorted by cumulative time, are written to profiling_output_<timestamp>.txt.
"""
import datetime
import argparse
import cProfile, pstats
try:
    from StringIO import StringIO  # Python 2: pstats writes byte strings
except ImportError:
    from io import StringIO  # Python 3
from file_importer import FileImporter
from metric_calculator import MetricCalculator
import datetime as dt

# Single pre-formatted argument: prints the same text on Python 2 and 3.
print('Starting metric calculation %s' % dt.datetime.now())
parser = argparse.ArgumentParser(description='Read a Tab-separated Graph Datafile and start Calculation of Metrics and Statistics as configured in config.py')

parser.add_argument('filename', metavar='filename', type=str,
                    help='the name of the data file containing tab separated node ids')

parser.add_argument('--profiling',dest='profiling',action='store_true', help='enable runtime profiling into profiling.txt file')

args = parser.parse_args()

if args.profiling:
    pr = cProfile.Profile()
    s = StringIO()
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    # Opened before the calculation so an unwritable location fails early.
    outfile = open('profiling_output_'+timestamp+'.txt', 'w')
    pr.enable()

try:
    fi = FileImporter(args.filename)
    graph = fi.read()
    # Formatted strings instead of print('text', value), which prints a tuple
    # repr when run as a Python 2 print statement.
    print('Network X graph has the following number of nodes %s' % graph.number_of_nodes())
    print('Network X graph has the following number of edges %s' % graph.number_of_edges())
    graph_gt = fi.read_gt()
    print('Graph tool graph has the following number of nodes %s' % graph_gt['graph_gt'].num_vertices())
    print('Graph tool graph has the following number of edges %s' % graph_gt['graph_gt'].num_edges())
    mc = MetricCalculator(graph,graph_gt)
    mc.start()

    if args.profiling:
        # stop profiling before the report itself is generated
        pr.disable()
        ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
        ps.print_stats()
        outfile.write(s.getvalue())
finally:
    if args.profiling:
        # always release the file handle, also when the calculation fails
        outfile.close()

print('Ending metric calculation %s' % dt.datetime.now())

File statistics.py added (mode: 100644) (index 0000000..fb03eaa)
1 #statistics.py
2 import redis as rd
3 import numpy as np
4 from scipy.stats import pearsonr
5
def calculate_statistics(self,metric,redis_key):
    """Compute summary statistics of one sorted set and store them in Redis.

    Reads all scores of the sorted set ``redis_key`` and writes their minimum,
    maximum, average, median and standard deviation (``np.std``) to the hash
    ``self.statistics_prefix + metric`` under the fields 'min', 'max',
    'average', 'median' and 'standard_deviation'.

    If the sorted set is empty nothing is written, because the NumPy
    reductions are undefined for an empty sequence.
    """
    scored_members = self.redis.zrange(redis_key, 0, -1, withscores=True, score_cast_func=float)
    # A real list of scores: a dict ``values()`` view cannot be reduced by
    # NumPy on Python 3, and members of a sorted set are unique anyway.
    all_values = [score for _member, score in scored_members]
    if not all_values:
        return

    statistics = (
        ('min', np.min(all_values)),
        ('max', np.max(all_values)),
        ('average', np.average(all_values)),
        ('median', np.median(all_values)),
        ('standard_deviation', np.std(all_values)),
    )

    statistics_key = self.statistics_prefix + metric
    for field, value in statistics:
        # Store plain floats; the text form of NumPy scalars depends on the
        # NumPy version and must not leak into the stored values.
        self.redis.hset(statistics_key, field, float(value))
20
21
def _aligned_scores(scores_a, scores_b):
    """Return the scores of both metrics for their shared nodes, in one common node order."""
    shared_nodes = sorted(set(scores_a) & set(scores_b))
    return ([scores_a[node] for node in shared_nodes],
            [scores_b[node] for node in shared_nodes])


def calculate_correlations(self):
    """Store the pairwise Pearson correlation of all base and advanced metrics.

    For every ordered pair of metric names taken from ``self.base_metrics``
    and ``self.advanced_metrics`` the scores of the two metric sorted sets are
    paired per node and passed to ``scipy.stats.pearsonr``. Coefficient and
    p-value are written to the hash
    ``self.statistics_prefix + "correlations:<source>:<target>"`` under the
    fields 'correlation' and 'confidence'.

    A metric paired with itself is stored as (1, 0). Pairs with fewer than two
    shared nodes are stored as (0, 0), since pearsonr needs at least two points.
    """
    # list() so the concatenation also works with Python 3 key views
    metrics = list(self.base_metrics.keys()) + list(self.advanced_metrics.keys())

    # Fetch every sorted set exactly once instead of once per metric pair.
    scores_by_metric = {}
    for metric in metrics:
        scores_by_metric[metric] = dict(self.redis.zrange(self.metric_prefix+metric, 0, -1, withscores=True, score_cast_func=float))

    correlations = {}
    for metric1 in metrics:
        correlations[metric1] = {}
        for metric2 in metrics:
            if metric1 == metric2:
                correlations[metric1][metric2] = (1,0)
                continue

            # Pair values node by node; sorting each metric's keys on its own
            # misaligns the values when the metrics cover different nodes.
            values_metric1, values_metric2 = _aligned_scores(scores_by_metric[metric1], scores_by_metric[metric2])

            if len(values_metric1) < 2:
                correlations[metric1][metric2] = (0,0)
                continue

            result = pearsonr(values_metric1,values_metric2)
            # plain floats, so the Redis client never has to serialize NumPy scalars
            correlations[metric1][metric2] = (float(result[0]), float(result[1]))

    for source in correlations:
        for target in correlations[source]:
            self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "correlation", correlations[source][target][0])
            self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "confidence", correlations[source][target][1])

File statistics.pyc added (mode: 100644) (index 0000000..b85b89d)
Hints:
Before your first commit, do not forget to set up your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://rocketgit.com/user/coria/coria-backend

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@ssh.rocketgit.com/user/coria/coria-backend

Clone this repository using git:
git clone git://git.rocketgit.com/user/coria/coria-backend

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a pull request:
... clone the repository ...
... make some changes and some commits ...
git push origin master