List of commits:
Subject                                          Hash                                      Author          Date (UTC)
a lot of refactoring for more modular structure  660e0d15e9b18aa9c1100c874e2140220f1c5860  mcehlert        2014-01-09 15:51:02
a lot of refactoring for more modular structure  dcf7bd73ccc2f871ab8d48c43d11a8e5b392b6de  mcehlert        2014-01-09 15:50:53
initial commit - pre colloquim state             655c77556f9d8e40b52893887cdb0d90f726fdbf  Mathias Ehlert  2013-11-22 13:47:29
Initial commit                                   f53ec7a3f25d55c53aa12c2682b216e16570cdc7  Mathias Ehlert  2013-11-22 13:37:47
Commit 660e0d15e9b18aa9c1100c874e2140220f1c5860 - a lot of refactoring for more modular structure
Author: mcehlert
Author date (UTC): 2014-01-09 15:51
Committer name: mcehlert
Committer date (UTC): 2014-01-09 15:51
Parent(s): dcf7bd73ccc2f871ab8d48c43d11a8e5b392b6de
Signing key:
Tree: 2264e76ef92d6137175da965e24a23d07d62f268
File               Lines added  Lines deleted
advancedscores.py           33              0
config.py                   81              0
indexing.py                 22              0
metrics.py                 129              0
normalizations.py           37              0
pearson.py                  45              0
start.py                    16              0
statistics.py               56              0
File advancedscores.py added (mode: 100644) (index 0000000..489636a)
1 # advancedscores.py
2 import numpy as np
3
4 ################
5 #advanced scores
6 ################
7
8 def adv_unified_risk_score(self):
9
10 #caching of all values in dictionaries
11 all_ccs_normalized = dict(self.redis.zrange(self.metric_prefix+'corrected_clustering_coefficient'+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float))
12 all_urs = dict(self.redis.zrange(self.score_prefix+'unified_risk_score', 0, -1, withscores=True, score_cast_func=float))
13
14 urs_percentile_10 = np.percentile(all_urs.values(), 10)
15 urs_percentile_90 = np.percentile(all_urs.values(), 90)
16
17 for node in self.nodes:
18 cc_normalized = all_ccs_normalized[str(node)]
19 urs = all_urs[str(node)]
20
21
22 if (urs >= urs_percentile_90 or urs <= urs_percentile_10):
23 if (cc_normalized >= 0.25):
24 advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
25 else:
26 advanced_unified_risk_score = urs
27 else:
28 advanced_unified_risk_score = urs
29
30 #save for node
31 self.redis.hset(self.node_prefix+str(node), 'advanced_unified_risk_score', advanced_unified_risk_score)
32 #save for score
33 self.redis.zadd(self.score_prefix+'advanced_unified_risk_score', advanced_unified_risk_score, str(node))
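
Note: the gate above only blends the normalized corrected clustering coefficient into the score for nodes whose unified risk score falls into the lowest or highest decile. A minimal sketch of that rule on plain dictionaries (made-up values, no Redis involved):

# advanced_urs_sketch.py - hypothetical sample values, no Redis
import numpy as np

urs = {'1': 0.91, '2': 0.40, '3': 0.05, '4': 0.55}       # unified risk scores per node
cc_norm = {'1': 0.30, '2': 0.10, '3': 0.50, '4': 0.20}   # normalized corrected clustering coefficients

p10 = np.percentile(list(urs.values()), 10)
p90 = np.percentile(list(urs.values()), 90)

advanced = {}
for node, score in urs.items():
    # only outer-decile nodes with a sufficiently high clustering coefficient
    # get the blended score; every other node keeps its plain URS
    if (score >= p90 or score <= p10) and cc_norm[node] >= 0.25:
        advanced[node] = (score * 3.0 + cc_norm[node]) / 4.0
    else:
        advanced[node] = score

print(advanced)
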
File config.py added (mode: 100644) (index 0000000..d4c8c5e)
1 #config.py
2 import metrics
3 import normalizations
4 import advancedscores
5
6 node_index_key = 'all_nodes'
7 metric_index_key = 'all_metrics'
8 score_index_key = 'all_scores'
9
10 node_neighbors_prefix = 'node_neighbors:'
11 node_prefix = 'node_metrics:'
12 metric_prefix = 'metric:'
13 score_prefix = 'score:'
14 statistics_prefix = 'statistics:'
15
16 normalization_suffix = '_normalized'
17
18 # definition of all base metrics for which absolute values will be calculated for each node in the first step
19 # key is the name of the metric and value is the implemented method which exposes the required interface
20 # interface: each method takes the node as the single parameter, performs the necessary calculation and
21 # returns a float containing the value for the specified node
22
23 base_metrics = { 'clustering_coefficient' : metrics.clustering_coefficient,
24 'degree' : metrics.degree,
25 'average_neighbor_degree' : metrics.average_neighbor_degree,
26 'iterated_average_neighbor_degree': metrics.iterated_average_neighbor_degree,
27 'betweenness_centrality' : metrics.betweenness_centrality,
28 'eccentricity' : metrics.eccentricity,
29 'average_shortest_path_length' : metrics.average_shortest_path_length
30 }
31
32
33 # some metrics might require some corrections or post processing which relies on the value of other metrics or normalizations
34 # key is the metric name and value the method for correction
35
36 advanced_metrics = {'corrected_clustering_coefficient' : metrics.correct_clustering_coefficient,
37 'corrected_average_neighbor_degree' : metrics.correct_average_neighbor_degree,
38 'corrected_iterated_average_neighbor_degree': metrics.correct_iterated_average_neighbor_degree}
39
40
41 # for every metric, a normalization method has to be specified
42 # key is the name of the metric and value is the normalization method which also has to expose the required interface
43 # interface: normalization methods take the name of the (absolute) metric as the single argument; no return value is required
44 # the method itself shall access the data which is required for normalization from the redis instance
45 # and the corresponding keys/values for the specified metric
46 # it shall then loop over all nodes and calculate the normalized value for the node and the metric
47 # afterwards it should save the result to redis using "metric_name_normalized" as the key
48 # the result is stored inside the node's hash for metrics
49
50 # also needs to include corrected metrics with their respective names
51 #
52 normalization_methods = { 'clustering_coefficient' : normalizations.min_max,
53 'corrected_clustering_coefficient' : normalizations.min_max,
54 'degree' : normalizations.min_max,
55 'average_neighbor_degree' : normalizations.min_max,
56 'corrected_average_neighbor_degree' : normalizations.min_max,
57 'iterated_average_neighbor_degree' : normalizations.min_max,
58 'corrected_iterated_average_neighbor_degree': normalizations.min_max,
59 'betweenness_centrality' : normalizations.min_max,
60 'eccentricity' : normalizations.max_min,
61 'average_shortest_path_length' : normalizations.max_min
62 }
63
64
65 # the easiest case for a score is a weighted combination of normalized metric values whose weights add up to 1
66 # such scores can easily be defined here
67 # note: names are not methods but redis keys
68
69 scores = {'unified_risk_score': { 'degree': 0.25,
70 'corrected_average_neighbor_degree': 0.15,
71 'corrected_iterated_average_neighbor_degree': 0.1,
72 'betweenness_centrality': 0.25,
73 'eccentricity': 0.125,
74 'average_shortest_path_length': 0.125}
75 }
76
77
78 # other scores might require a more sophisticated algorithm to be calculated
79 # such scores need to be added here and implemented like the example below
80
81 advanced_scores = {'advanced_unified_risk_score': advancedscores.adv_unified_risk_score}
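
Taken together, these dictionaries are plain name-to-callable (or name-to-weight) mappings, so a driver only has to iterate over them. Below is a rough sketch of such a driver loop; the calc object, its nodes/redis attributes and the storage layout are assumptions for illustration, not part of this commit (corrections and advanced scores omitted for brevity):

# pipeline_sketch.py - hypothetical driver loop; calc and its attributes are assumptions
import config

def run_pipeline(calc):
    # step 1: absolute value of every configured base metric, stored per node and per metric
    for metric_name, metric_method in config.base_metrics.items():
        for node in calc.nodes:
            value = metric_method(calc, node)
            calc.redis.hset(config.node_prefix + str(node), metric_name, value)
            calc.redis.zadd(config.metric_prefix + metric_name, value, str(node))

    # step 2: normalize every metric with its configured method
    for metric_name, normalization_method in config.normalization_methods.items():
        normalization_method(calc, metric_name)

    # step 3: simple scores are weighted sums of the normalized values
    for score_name, weights in config.scores.items():
        for node in calc.nodes:
            score = sum(weight * float(calc.redis.hget(config.node_prefix + str(node),
                                                       metric + config.normalization_suffix))
                        for metric, weight in weights.items())
            # same score-then-member zadd argument order as the rest of this commit
            calc.redis.zadd(config.score_prefix + score_name, score, str(node))
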
File indexing.py added (mode: 100644) (index 0000000..359eb71)
1 #indexing
2 def index_nodes(self):
3 self.redis.sadd(self.node_index_key, self.nodes)
4
5 def index_neighbors(self):
6 for node in self.nodes:
7 node_neighbors = self.graph.neighbors(int(node))
8 self.redis.sadd(self.node_neighbors_prefix+str(node), node_neighbors)
9
10 def index_metrics(self):
11 for metric in self.base_metrics:
12 self.redis.sadd(self.metric_index_key, metric)
13
14 for advanced_metric in self.advanced_metrics:
15 self.redis.sadd(self.metric_index_key, advanced_metric)
16
17 def index_scores(self):
18 for score in self.scores:
19 self.redis.sadd(self.score_index_key, score)
20
21 for advanced_score in self.advanced_scores:
22 self.redis.sadd(self.score_index_key, advanced_score)
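
One detail to keep in mind here: redis-py's sadd is variadic (sadd(key, member1, member2, ...)), and whether a whole Python list is accepted as a single argument depends on the client version, so unpacking the collection is the portable form. A small standalone sketch (assumes a local Redis server and the redis-py client):

# indexing_sketch.py - assumes a Redis instance on localhost and redis-py installed
import redis

rdb = redis.StrictRedis(host='localhost', port=6379, db=0)
nodes = [1, 2, 3]

# unpack the list so every node becomes its own set member
rdb.sadd('all_nodes', *[str(node) for node in nodes])
print(rdb.smembers('all_nodes'))
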
File metrics.py added (mode: 100644) (index 0000000..d0b9c8e)
1 #metrics.py
2 import networkx as nx
3 import numpy as np
4
5 def clustering_coefficient(self,node):
6 #in the first run calculate the metric for all nodes at once and save in a hash of the instance to access later
7 #NOTE: this should result in a performance gain, but for very large graphs this might be a problem.
8 # in this case, just returning nx.clustering(self.graph, node) might be better
9 if not hasattr(self, 'all_clustering_coefficients'):
10 self.all_clustering_coefficients = nx.clustering(self.graph)
11
12 #get the actual value from the pre-calculated hash
13 return self.all_clustering_coefficients[node]
14
15 def degree(self, node):
16 return self.graph.degree(node)
17
18
19 def average_neighbor_degree(self,node):
20 # same caching technique as in self.clustering_coefficient
21 # might also break for very large graphs
22 # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go
23
24 if not hasattr(self, 'all_average_neighbor_degrees'):
25 self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
26 return self.all_average_neighbor_degrees[node]
27
28 def iterated_average_neighbor_degree(self, node):
29
30 first_level_neighbors = self.graph.neighbors(node)
31 second_level_neighbors = []
32
33 # get all two-hop nodes
34 for first_level_neighbor in first_level_neighbors:
35 current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
36 second_level_neighbors.extend(current_second_level_neighbors)
37
38 #remove one-hop nodes and self
39 relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
40
41 degree_sum = 0
42 for relevant_node in relevant_nodes:
43 degree_sum += self.graph.degree(relevant_node)
44
45 return float(degree_sum)/float(len(relevant_nodes))
46
47 def betweenness_centrality(self, node):
48 if not hasattr(self, 'all_betweenness_centralities'):
49 self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
50 return self.all_betweenness_centralities[node]
51
52 def eccentricity(self, node):
53 if not hasattr(self, 'all_eccentricities'):
54 self.all_eccentricities = nx.eccentricity(self.graph)
55 return self.all_eccentricities[node]
56
57 def average_shortest_path_length(self, node):
58 # caching average_shortest_path_length for all nodes at once failed
59 # already switched to single calculation
60
61 #get all shortest path lengths
62 all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)
63
64 #calculate average
65 sum_of_lengths = 0
66 for target in all_shortest_path_lengths_for_node:
67 sum_of_lengths += all_shortest_path_lengths_for_node[target]
68
69 return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)
70
71
72 #############
73 # advanced metrics
74 #############
75 def correct_clustering_coefficient(self,node):
76 clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
77 degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
78 corrected_cc = clustering_coefficient + (degree * clustering_coefficient) / float(4)
79 return corrected_cc
80
81 def correct_average_neighbor_degree(self,node):
82 avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))
83
84 neighbors = self.graph.neighbors(node)
85 number_of_neighbors = float(len(neighbors))
86 neighbor_degrees = []
87 for neighbor in neighbors:
88 neighbor_degrees.append(self.graph.degree(neighbor))
89
90 #using numpy median and standard deviation implementation
91 numpy_neighbor_degrees = np.array(neighbor_degrees)
92 median = np.median(numpy_neighbor_degrees)
93 standard_deviation = np.std(numpy_neighbor_degrees)
94
95 if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
96 return avgnd
97 else:
98 return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_neighbors ) * avgnd
99
100
101 def correct_iterated_average_neighbor_degree(self, node):
102 avgnd = float(self.redis.hget(self.node_prefix+str(node), 'iterated_average_neighbor_degree'))
103
104 first_level_neighbors = self.graph.neighbors(node)
105 second_level_neighbors = []
106
107 # get all two-hop nodes
108 for first_level_neighbor in first_level_neighbors:
109 current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
110 second_level_neighbors.extend(current_second_level_neighbors)
111
112 #remove one-hop neighbors and self
113 relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
114
115 number_of_nodes = len(relevant_nodes)
116 node_degrees = []
117 for rel_node in relevant_nodes:
118 node_degrees.append(self.graph.degree(rel_node))
119
120 numpy_node_degrees = np.array(node_degrees)
121 median = np.median(numpy_node_degrees)
122 standard_deviation = np.std(numpy_node_degrees)
123
124 if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
125 return avgnd
126 else:
127 return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_nodes ) * avgnd
128
129
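
For a quick sanity check of the two-hop logic in iterated_average_neighbor_degree, the same computation can be run standalone on a toy graph (networkx only, no Redis; note that a node without any two-hop neighbors would make the division fail):

# metrics_sketch.py - standalone check of the two-hop average on a toy graph
import networkx as nx

def iterated_average_neighbor_degree(graph, node):
    first_level = list(graph.neighbors(node))
    second_level = []
    for neighbor in first_level:
        second_level.extend(graph.neighbors(neighbor))
    # keep only true two-hop nodes: drop direct neighbors and the node itself
    relevant = set(second_level) - set(first_level) - {node}
    return float(sum(graph.degree(n) for n in relevant)) / len(relevant)

graph = nx.path_graph(5)                             # 0-1-2-3-4
print(iterated_average_neighbor_degree(graph, 0))    # only two-hop node is 2 (degree 2) -> 2.0
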
File normalizations.py added (mode: 100644) (index 0000000..a959a8c)
1 #normalizations.py
2 def min_max(self,metric_name):
3 #perform min max normalization of specified metric for all nodes
4 #min_max normalization
5 #get min and max from redis
6 x_min = self.redis.zrange(self.metric_prefix+metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
7 x_max = self.redis.zrange(self.metric_prefix+metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
8
9 #print x_min
10 #print x_max
11
12 for node in self.nodes:
13 if x_min == x_max:
14 x_normalized = 1.0
15 else:
16 x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
17 x_normalized = (x - x_min) / (x_max - x_min)
18
19 #store value for node and metric
20 self.redis.zadd(self.metric_prefix+metric_name+self.normalization_suffix, x_normalized, str(node))
21 self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
22
23 #max min normalization
24 def max_min(self,metric_name):
25 x_min = self.redis.zrange(self.metric_prefix+metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
26 x_max = self.redis.zrange(self.metric_prefix+metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
27
28 for node in self.nodes:
29 if x_min == x_max:
30 x_normalized = 1.0
31 else:
32 x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
33 x_normalized = (x_max - x) / (x_max - x_min)
34
35 #store value for node and metric
36 self.redis.zadd(self.metric_prefix+metric_name+self.normalization_suffix, x_normalized, str(node))
37 self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
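
The two methods differ only in direction: min_max maps the smallest raw value to 0 and the largest to 1, while max_min inverts the scale so that small raw values (short path lengths, low eccentricity) come out as high normalized values. On a plain list the two formulas look like this:

# normalization_sketch.py - plain-Python illustration of both directions
values = [2.0, 4.0, 6.0, 10.0]
x_min, x_max = min(values), max(values)

min_max_normalized = [(x - x_min) / (x_max - x_min) for x in values]
max_min_normalized = [(x_max - x) / (x_max - x_min) for x in values]

print(min_max_normalized)   # [0.0, 0.25, 0.5, 1.0]
print(max_min_normalized)   # [1.0, 0.75, 0.5, 0.0]
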
File pearson.py added (mode: 100644) (index 0000000..7a6cc1c)
1 import redis as rd
2 import numpy as np
3 from scipy.stats import pearsonr
4
5 metrics = ['clustering_coefficient',
6 'degree',
7 'average_neighbor_degree',
8 'iterated_average_neighbor_degree',
9 'betweenness_centrality',
10 'eccentricity',
11 'average_shortest_path_length',
12 'corrected_clustering_coefficient',
13 'corrected_average_neighbor_degree',
14 'corrected_iterated_average_neighbor_degree']
15
16 rdb = rd.StrictRedis(host='localhost', port=6379, db=0)
17
18
19 correlations = {}
20 for metric1 in metrics:
21 correlations[metric1] = {}
22 for metric2 in metrics:
23 correlations[metric1][metric2] = (0,0)
24 if metric1 == metric2:
25 correlations[metric1][metric2] = (1,0)
26 continue
27
28 dict_metric1 = dict(rdb.zrange(metric1, 0, -1, withscores=True, score_cast_func=float))
29 dict_metric2 = dict(rdb.zrange(metric2, 0, -1, withscores=True, score_cast_func=float))
30
31 values_metric1 = []
32 values_metric2 = []
33
34 for key in sorted(dict_metric1.iterkeys()):
35 values_metric1.append(dict_metric1[key])
36
37 for key in sorted(dict_metric2.iterkeys()):
38 values_metric2.append(dict_metric2[key])
39
40 correlations[metric1][metric2] = pearsonr(values_metric1,values_metric2)
41
42 for source in correlations:
43 for target in correlations[source]:
44 rdb.hset("correlations:"+source+":"+target, "correlation", correlations[source][target][0])
45 rdb.hset("correlations:"+source+":"+target, "confidence", correlations[source][target][1])
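
pearsonr expects the two value lists to be aligned on the same node order, which is why both dictionaries are sorted by key before the call. A minimal illustration with made-up metric values:

# pearson_sketch.py - made-up values, scipy only
from scipy.stats import pearsonr

degree      = {'1': 3.0, '2': 1.0, '3': 2.0}
betweenness = {'1': 0.9, '2': 0.1, '3': 0.4}

nodes = sorted(degree)   # identical node order for both value lists
r, p = pearsonr([degree[n] for n in nodes], [betweenness[n] for n in nodes])
print(r, p)              # correlation coefficient and two-tailed p-value
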
File start.py added (mode: 100644) (index 0000000..29e5255)
1 #!/usr/bin/env python
2 import argparse
3 from file_importer import FileImporter
4 from metric_calculator import MetricCalculator
5
6 parser = argparse.ArgumentParser(description='Read a Tab-separated Graph Datafile and start Calculation of Metrics and Statistics as configured in config.py')
7
8 parser.add_argument('filename', metavar='filename', type=str,
9 help='the name of the data file containing tab separated node ids')
10
11 args = parser.parse_args()
12
13 fi = FileImporter(args.filename)
14 graph = fi.read()
15 mc = MetricCalculator(graph)
16 mc.start()
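
Usage note: with file_importer.py and metric_calculator.py in place (they are not part of this commit) and a Redis server running, the entry point would be invoked with the path to a tab-separated edge list, e.g. python start.py edges.tsv, where edges.tsv is only an example file name.
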
File statistics.py added (mode: 100644) (index 0000000..fb03eaa)
1 #statistics.py
2 import redis as rd
3 import numpy as np
4 from scipy.stats import pearsonr
5
6 def calculate_statistics(self,metric,redis_key):
7 all_values = dict(self.redis.zrange(redis_key, 0, -1, withscores=True, score_cast_func=float)).values()
8 min_value = np.min(all_values)
9 max_value = np.max(all_values)
10
11 average = np.average(all_values)
12 median = np.median(all_values)
13 standard_deviation = np.std(all_values)
14
15 self.redis.hset(self.statistics_prefix+metric, 'min', min_value)
16 self.redis.hset(self.statistics_prefix+metric, 'max', max_value)
17 self.redis.hset(self.statistics_prefix+metric, 'average', average)
18 self.redis.hset(self.statistics_prefix+metric, 'median', median)
19 self.redis.hset(self.statistics_prefix+metric, 'standard_deviation', standard_deviation)
20
21
22 def calculate_correlations(self):
23 m = self.base_metrics.keys()
24 c = self.advanced_metrics.keys()
25
26 metrics = m + c
27
28 correlations = {}
29 for metric1 in metrics:
30 correlations[metric1] = {}
31 for metric2 in metrics:
32 correlations[metric1][metric2] = (0,0)
33 if metric1 == metric2:
34 correlations[metric1][metric2] = (1,0)
35 continue
36
37 dict_metric1 = dict(self.redis.zrange(self.metric_prefix+metric1, 0, -1, withscores=True, score_cast_func=float))
38 dict_metric2 = dict(self.redis.zrange(self.metric_prefix+metric2, 0, -1, withscores=True, score_cast_func=float))
39 values_metric1 = []
40 values_metric2 = []
41
42 for key in sorted(dict_metric1.iterkeys()):
43 values_metric1.append(dict_metric1[key])
44
45 for key in sorted(dict_metric2.iterkeys()):
46 values_metric2.append(dict_metric2[key])
47
48 correlations[metric1][metric2] = pearsonr(values_metric1,values_metric2)
49
50 values_metric1 = []
51 values_metric2 = []
52
53 for source in correlations:
54 for target in correlations[source]:
55 self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "correlation", correlations[source][target][0])
56 self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "confidence", correlations[source][target][1])
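
calculate_statistics boils a metric's value set down to five numpy aggregates before writing them to the statistics hash; the same reduction on a plain list of example values:

# statistics_sketch.py - numpy aggregates over example metric values
import numpy as np

values = [1.0, 2.0, 2.0, 3.0, 10.0]

print(np.min(values), np.max(values))   # 1.0 10.0
print(np.average(values))               # 3.6
print(np.median(values))                # 2.0
print(np.std(values))                   # population standard deviation
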