List of commits:
Subject | Hash | Author | Date (UTC)
a lot of refactoring for more modular structure | dcf7bd73ccc2f871ab8d48c43d11a8e5b392b6de | mcehlert | 2014-01-09 15:50:53
initial commit - pre colloquim state | 655c77556f9d8e40b52893887cdb0d90f726fdbf | Mathias Ehlert | 2013-11-22 13:47:29
Initial commit | f53ec7a3f25d55c53aa12c2682b216e16570cdc7 | Mathias Ehlert | 2013-11-22 13:37:47
Commit dcf7bd73ccc2f871ab8d48c43d11a8e5b392b6de - a lot of refactoring for more modular structure
Author: mcehlert
Author date (UTC): 2014-01-09 15:50
Committer name: mcehlert
Committer date (UTC): 2014-01-09 15:50
Parent(s): 655c77556f9d8e40b52893887cdb0d90f726fdbf
Signer:
Signing key:
Signing status: N
Tree: 8c62df349128ce021e0230ab8e76dc0fdc363c98
File Lines added Lines deleted
file_importer.py 9 11
file_importer.pyc 0 0
metric_calculator.py 483 319
metric_calculator.pyc 0 0
profiling.py 0 43
run.py 0 40
test.py 0 17
File file_importer.py changed (mode: 100644) (index 9796f9f..3600618)
1 1 import networkx as nx import networkx as nx
2 import redis as rd
3 2
4 3 class FileImporter(object): class FileImporter(object):
4
5 5 def __init__(self,filename): def __init__(self,filename):
6 # initialize data file to parse and new empty graph
7
6 8 self.data_file = open(filename) self.data_file = open(filename)
7 self.all_nodes = []
8 self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
9 9 self.graph = nx.Graph() self.graph = nx.Graph()
10 10
11 11 def read(self): def read(self):
12 self.redis.flushdb()
13 12 for line in self.data_file: for line in self.data_file:
14 13 self.parse_line(line) self.parse_line(line)
15 self.save_all_nodes()
16 14 return self.graph return self.graph
17 15
18 16 def parse_line(self, line): def parse_line(self, line):
17 # split each line on tabstop
18 # first field specifies the source node
19 # second field specifies the target node
20
19 21 fields = line.strip().split("\t") fields = line.strip().split("\t")
20 22 from_node = int(fields[0]) from_node = int(fields[0])
21 23 to_node = int(fields[1]) to_node = int(fields[1])
22 self.all_nodes.extend([from_node,to_node])
23 self.graph.add_edge(from_node, to_node)
24 24
25 def save_all_nodes(self):
26 self.unique_nodes = list(set(self.all_nodes))
27 self.unique_nodes.sort()
28 self.redis.sadd('all_nodes', *self.unique_nodes)
25 # add edge to the graph
26 self.graph.add_edge(from_node, to_node)
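For reference, the importer still expects a plain tab-separated edge list (one integer source node and one integer target node per line); this commit only moves the Redis bookkeeping out of the class. A minimal usage sketch, with a hypothetical input file name, looks like this:

# edges.tsv (hypothetical) -- one edge per line, two tab-separated integer node ids, e.g.
#   1<TAB>2
#   1<TAB>3
from file_importer import FileImporter

importer = FileImporter('edges.tsv')   # hypothetical file name
graph = importer.read()                # returns the populated networkx.Graph
print graph.number_of_nodes(), graph.number_of_edges()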
File file_importer.pyc deleted (index 9e675cc..0000000)
File metric_calculator.py changed (mode: 100644) (index a573c4e..281cf20)
1 1 import networkx as nx import networkx as nx
2 2 import redis as rd import redis as rd
3 3 import numpy as np import numpy as np
4 import indexing
5 import statistics
6 import normalizations
7 import config
4 8
5 9
6 10 class MetricCalculator(object): class MetricCalculator(object):
 
... ... class MetricCalculator(object):
8 12 self.graph = graph self.graph = graph
9 13 self.redis = rd.StrictRedis(host='localhost', port=6379, db=0) self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
10 14 self.nodes = nx.nodes(graph) self.nodes = nx.nodes(graph)
11
12 self.node_neighbors_prefix = 'node_neighbors:'
13 self.node_prefix = 'node_metrics:'
14 self.normalization_suffix = '_normalized'
15
16 # definition of all base metrics for which absolute values will be calculated for each node in the first step
17 # key is the name of the metric and value is the implemented method which exposes the required interface
18 # interface: each method takes the node as the single parameter, performs the necessary calculation and
19 # returns a float containing the value for the specified node
20
21 self.metrics = { 'clustering_coefficient' : self.clustering_coefficient,
22 'degree' : self.degree,
23 'average_neighbor_degree' : self.average_neighbor_degree,
24 'iterated_average_neighbor_degree': self.iterated_average_neighbor_degree,
25 'betweenness_centrality' : self.betweenness_centrality,
26 'eccentricity' : self.eccentricity,
27 'average_shortest_path_length' : self.average_shortest_path_length
28 }
29
30
31 # for the frontend
32 # self.metric_names = {
33 # 'clustering_coefficient' : 'Clustering Coefficient',
34 # 'degree' : 'Node Degree',
35 # 'average_neighbor_degree' : 'Average Neighbor Node Degree',
36 # 'iterated_average_neighbor_degree': 'Iterated Average Neighbor Node Degree',
37 # 'betweenness_centrality' : 'Betweenness Centrality',
38 # 'eccentricity' : 'Node Eccentricity',
39 # 'average_shortest_path_length' : 'Average Shortest Path Length'
40 # }
41
42
43 # some metrics might require some corrections or post processing which relies on the value of other metrics or normalizations
44 # key is the metric name and value the method for correction
45
46
47 self.corrections = {'corrected_clustering_coefficient' : self.correct_clustering_coefficient,
48 'corrected_average_neighbor_degree' : self.correct_average_neighbor_degree,
49 'corrected_iterated_average_neighbor_degree': self.correct_iterated_average_neighbor_degree}
50
51
52
53 # for every metric, a normalization method has to be specified
54 # key is the name of the metric and value is the normalization method which also has to expose the required interface
55 # interface: normalization methods, take the name of the (absolute) metric as the single argument, no return value is required
56 # the method itself shall access the data which is required for normalization from the redis instance
57 # and the corresponding keys/values for the specified metric
58 # it shall then loop over all nodes and calculate the normalized value for the node and the metric
59 # afterwards it should save the result to redis using "metric_name_normalized" as the key
60 # the result is stored inside the node's hash for metrics
61
62 # also needs to include corrected metrics with their respective names
63 #
64 self.normalization_methods = { 'clustering_coefficient' : self.min_max_normalization,
65 'corrected_clustering_coefficient' : self.min_max_normalization,
66 'degree' : self.min_max_normalization,
67 'average_neighbor_degree' : self.min_max_normalization,
68 'corrected_average_neighbor_degree' : self.min_max_normalization,
69 'iterated_average_neighbor_degree' : self.min_max_normalization,
70 'corrected_iterated_average_neighbor_degree': self.min_max_normalization,
71 'betweenness_centrality' : self.min_max_normalization,
72 'eccentricity' : self.inverse_min_max_normalization,
73 'average_shortest_path_length' : self.inverse_min_max_normalization
74 }
75
76
77 # the easiest case for a score is a combination of normalized metric values with a weight which adds up to 1
78 # such scores can easily be defined here
79
80 #self.scores = ['unified_risk_score']
81
82 self.scores = {'unified_risk_score': { #'corrected_clustering_coefficient': 0.2,
83 'degree_normalized': 0.25,
84 'corrected_average_neighbor_degree_normalized': 0.15,
85 'corrected_iterated_average_neighbor_degree_normalized': 0.1,
86 'betweenness_centrality_normalized': 0.25,
87 'eccentricity_normalized': 0.125,
88 'average_shortest_path_length_normalized': 0.125}
89 }
90 15
16 self.node_index_key = config.node_index_key
17 self.metric_index_key = config.metric_index_key
18 self.score_index_key = config.score_index_key
19
20 self.node_neighbors_prefix = config.node_neighbors_prefix
21 self.node_prefix = config.node_prefix
22 self.metric_prefix = config.metric_prefix
23 self.score_prefix = config.score_prefix
24 self.statistics_prefix = config.statistics_prefix
25
26 self.normalization_suffix = config.normalization_suffix
27
28 self.base_metrics = config.base_metrics
29 self.advanced_metrics = config.advanced_metrics
30
31 self.normalization_methods = config.normalization_methods
32
33 self.scores = config.scores
34 self.advanced_scores = config.advanced_scores
35
36
37
38 # self.node_index_key = 'all_nodes'
39 # self.metric_index_key = 'all_metrics'
40 # self.score_index_key = 'all_scores'
41 #
42 # self.node_neighbors_prefix = 'node_neighbors:'
43 # self.node_prefix = 'node_metrics:'
44 # self.metric_prefix = 'metric:'
45 # self.statistics_prefix = 'statistics:'
46 #
47 # self.normalization_suffix = '_normalized'
48 #
49 # # definition of all base metrics for which absolute values will be calculated for each node in the first step
50 # # key is the name of the metric and value is the implemented method which exposes the required interface
51 # # interface: each method takes the node as the single parameter, performs the necessary calculation and
52 # # returns a float containing the value for the specified node
53 #
54 # self.metrics = { 'clustering_coefficient' : self.clustering_coefficient,
55 # 'degree' : self.degree,
56 # 'average_neighbor_degree' : self.average_neighbor_degree,
57 # 'iterated_average_neighbor_degree': self.iterated_average_neighbor_degree,
58 # 'betweenness_centrality' : self.betweenness_centrality,
59 # 'eccentricity' : self.eccentricity,
60 # 'average_shortest_path_length' : self.average_shortest_path_length
61 # }
62 #
63 #
64 # # some metrics might require some corrections or post processing which relies on the value of other metrics or normalizations
65 # # key is the metric name and value the method for correction
66 #
67 #
68 # self.advanced_metrics = { 'corrected_clustering_coefficient' : self.correct_clustering_coefficient,
69 # 'corrected_average_neighbor_degree' : self.correct_average_neighbor_degree,
70 # 'corrected_iterated_average_neighbor_degree': self.correct_iterated_average_neighbor_degree}
71 #
72 #
73 #
74 # # for every metric, a normalization method has to be specified
75 # # key is the name of the metric and value is the normalization method which also has to expose the required interface
76 # # interface: normalization methods, take the name of the (absolute) metric as the single argument, no return value is required
77 # # the method itself shall access the data which is required for normalization from the redis instance
78 # # and the corresponding keys/values for the specified metric
79 # # it shall then loop over all nodes and calculate the normalized value for the node and the metric
80 # # afterwards it should save the result to redis using "metric_name_normalized" as the key
81 # # the result is stored inside the node's hash for metrics
82 #
83 # # also needs to include corrected metrics with their respective names
84 # #
85 # self.normalization_methods = { 'clustering_coefficient' : self.min_max_normalization,
86 # 'corrected_clustering_coefficient' : self.min_max_normalization,
87 # 'degree' : self.min_max_normalization,
88 # 'average_neighbor_degree' : self.min_max_normalization,
89 # 'corrected_average_neighbor_degree' : self.min_max_normalization,
90 # 'iterated_average_neighbor_degree' : self.min_max_normalization,
91 # 'corrected_iterated_average_neighbor_degree': self.min_max_normalization,
92 # 'betweenness_centrality' : self.min_max_normalization,
93 # 'eccentricity' : self.inverse_min_max_normalization,
94 # 'average_shortest_path_length' : self.inverse_min_max_normalization
95 # }
96 #
97 #
98 # # the easiest case for a score is a combination of normalized metric values with a weight which adds up to 1
99 # # such scores can easily be defined here
100 # # note: names are not methods but redis keys
101 #
102 # self.scores = {'unified_risk_score': { #'corrected_clustering_coefficient': 0.2,
103 # 'degree_normalized': 0.25,
104 # 'corrected_average_neighbor_degree_normalized': 0.15,
105 # 'corrected_iterated_average_neighbor_degree_normalized': 0.1,
106 # 'betweenness_centrality_normalized': 0.25,
107 # 'eccentricity_normalized': 0.125,
108 # 'average_shortest_path_length_normalized': 0.125}
109 # }
110 #
111 #
112 # # other scores might require a more sophisticated algorithm to be calculated
113 # # such scores need to be added here and implemented like the example below
114 #
115 # self.advanced_scores = {'advanced_unified_risk_score': self.urs_clustering_coefficient_modification}
91 116
92 # other scores might require a more sophisticated algorithm to be calculated
93 # such scores need to be added here and implemented like the example below
94 117
95 self.advanced_scores = {'advanced_unified_risk_score': self.urs_clustering_coefficient_modification}
96 118
97 119
98 120
99 121 def start(self): def start(self):
122 #clean all data in Redis
123 self.redis.flushdb()
124
125 #index creation
126 #self.index_nodes()
127 #self.index_neighbors()
128 #self.index_metrics()
129 #self.index_scores()
130
131 self.create_indexes()
100 132
101 self.store_neighbors()
133
134 #main calculations
102 135 self.calculate_metrics() self.calculate_metrics()
103 self.calculate_corrections()
136 self.calculate_advanced_metrics()
104 137 self.normalize_metrics() self.normalize_metrics()
105 138 self.calculate_scores() self.calculate_scores()
106 139 self.calculate_advanced_scores() self.calculate_advanced_scores()
107 140
108
109
110 # write list of neighbors of each node to redis for navigation purposes in frontend
111 def store_neighbors(self):
112 for node in self.nodes:
113 node_neighbors = self.graph.neighbors(int(node))
114 self.redis.sadd(self.node_neighbors_prefix+str(node), *node_neighbors)
115
141 #statistics
142 self.calculate_statistics()
143
144 ##################
145 #### INDEXING ####
146 ##################
147 def create_indexes(self):
148 indexing.index_nodes(self)
149 indexing.index_neighbors(self)
150 indexing.index_metrics(self)
151 indexing.index_scores(self)
152
153
154 # def index_nodes(self):
155 # self.redis.sadd(self.node_index_key, *self.nodes)
156 #
157 # def index_neighbors(self):
158 # for node in self.nodes:
159 # node_neighbors = self.graph.neighbors(int(node))
160 # self.redis.sadd(self.node_neighbors_prefix+str(node), *node_neighbors)
161 #
162 # def index_metrics(self):
163 # for metric in self.metrics:
164 # self.redis.sadd(self.metric_index_key, metric)
165 #
166 # for advanced_metric in self.advanced_metrics:
167 # self.redis.sadd(self.metric_index_key, advanced_metric)
168 #
169 # def index_scores(self):
170 # for score in self.scores:
171 # self.redis.sadd(self.score_index_key, score)
172 #
173 # for advanced_score in self.advanced_scores:
174 # self.redis.sadd(self.score_index_key, advanced_score)
175
176 ###########################
177 #### CALCULATION LOOPS ####
178 ###########################
116 179 # loop through all defined metrics and call specified calculation method for each node # loop through all defined metrics and call specified calculation method for each node
117 180 def calculate_metrics(self): def calculate_metrics(self):
118 for metric_name in self.metrics:
119 metric_method = self.metrics[metric_name]
181 for metric_name in self.base_metrics:
182 metric_method = self.base_metrics[metric_name]
120 183
121 184 # loop through all nodes # loop through all nodes
122 185 for node in self.nodes: for node in self.nodes:
123 186
124 187 # call calculation method of supplied metric for current node # call calculation method of supplied metric for current node
125 188 node = int(node) node = int(node)
126 value = float(metric_method(node))
189 value = float(metric_method(self,node))
127 190
128 191 #store result in node values #store result in node values
129 192 self.redis.hset(self.node_prefix+str(node), metric_name, value) self.redis.hset(self.node_prefix+str(node), metric_name, value)
130 193
131 194 #also store result to metric set #also store result to metric set
132 self.redis.zadd(metric_name, value, str(node))
195 self.redis.zadd(self.metric_prefix+metric_name, value, str(node))
133 196
134 # loop through all defined corrections and call specified calculation method
135 def calculate_corrections(self):
136 for correction_name in self.corrections:
137 correction_method = self.corrections[correction_name]
197 # loop through all defined_advanced_metrics and call specified calculation method
198 def calculate_advanced_metrics(self):
199 for advanced_metric_name in self.advanced_metrics:
200 metric_method = self.advanced_metrics[advanced_metric_name]
138 201 for node in self.nodes: for node in self.nodes:
139 202 node = int(node) node = int(node)
140 value = float(correction_method(node))
203 value = float(metric_method(self,node))
141 204
142 205 #store result in node values #store result in node values
143 self.redis.hset(self.node_prefix+str(node), correction_name, value)
206 self.redis.hset(self.node_prefix+str(node), advanced_metric_name, value)
144 207
145 208 #also store result to metric set #also store result to metric set
146 self.redis.zadd(correction_name, value, str(node))
209 self.redis.zadd(self.metric_prefix+advanced_metric_name, value, str(node))
147 210
148 211
149 212 # loop through all defined normalizations and call respective normalization method # loop through all defined normalizations and call respective normalization method
150 213 # no default normalizations for metrics not listed in the "normalization_methods" hash # no default normalizations for metrics not listed in the "normalization_methods" hash
151 214 def normalize_metrics(self): def normalize_metrics(self):
152 for metric_name in self.normalization_methods:
153 normalization_method = self.normalization_methods[metric_name]
154 normalization_method(metric_name)
155
156 # normalizations
157 # min max normalization
158 def min_max_normalization(self,metric_name):
159 #perform min max normalization of specified metric for all nodes
160 #min_max normalization
161 #get min and max from redis
162 x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
163 x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
164
165 #print x_min
166 #print x_max
215 #fallback normalization: min-max
167 216
168 for node in self.nodes:
169 if x_min == x_max:
170 x_normalized = 1.0
217 all_metrics = dict(self.base_metrics.items() + self.advanced_metrics.items())
218
219 for metric_name in all_metrics:
220 if self.normalization_methods.has_key(metric_name):
221 normalization_method = self.normalization_methods[metric_name]
171 222 else: else:
172 x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
173 x_normalized = (x - x_min) / (x_max - x_min)
174
175 #store value for node and metric
176 self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
177 self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
178
179 #max min normalization
180 def inverse_min_max_normalization(self,metric_name):
181 x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
182 x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
223 #fallback normalization is min-max
224 normalization_method = normalizations.min_max
225 normalization_method(self,metric_name)
183 226
184 for node in self.nodes:
185 if x_min == x_max:
186 x_normalized = 1.0
187 else:
188 x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
189 x_normalized = (x_max - x) / (x_max - x_min)
190
191 #store value for node and metric
192 self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
193 self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
194 227
195 228
229
230 # # normalizations
231 # # min max normalization
232 # def min_max_normalization(self,metric_name):
233 # #perform min max normalization of specified metric for all nodes
234 # #min_max normalization
235 # #get min and max from redis
236 # x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
237 # x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
238 #
239 # #print x_min
240 # #print x_max
241 #
242 # for node in self.nodes:
243 # if x_min == x_max:
244 # x_normalized = 1.0
245 # else:
246 # x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
247 # x_normalized = (x - x_min) / (x_max - x_min)
248 #
249 # #store value for node and metric
250 # self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
251 # self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
252 #
253 # #max min normalization
254 # def inverse_min_max_normalization(self,metric_name):
255 # x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
256 # x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]
257 #
258 # for node in self.nodes:
259 # if x_min == x_max:
260 # x_normalized = 1.0
261 # else:
262 # x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
263 # x_normalized = (x_max - x) / (x_max - x_min)
264 #
265 # #store value for node and metric
266 # self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
267 # self.redis.hset(self.node_prefix+str(node),metric_name+self.normalization_suffix, x_normalized)
268 #
196 269 def calculate_scores(self): def calculate_scores(self):
197 270 for score_name in self.scores: for score_name in self.scores:
198 271 metrics_with_weights = self.scores[score_name] metrics_with_weights = self.scores[score_name]
 
... ... class MetricCalculator(object):
200 273 for node in self.nodes: for node in self.nodes:
201 274 score_value = 0.0 score_value = 0.0
202 275
276 # get normalized values
203 277 for metric in metrics_with_weights: for metric in metrics_with_weights:
204 278 weight = self.scores[score_name][metric] weight = self.scores[score_name][metric]
205 value = float(self.redis.hget(self.node_prefix+str(node),metric))
279 value = float(self.redis.hget(self.node_prefix+str(node),metric+self.normalization_suffix))
206 280 score_value += weight * value score_value += weight * value
207 281
208 282 self.redis.hset(self.node_prefix+str(node),score_name, score_value) self.redis.hset(self.node_prefix+str(node),score_name, score_value)
209 self.redis.zadd(score_name, score_value, str(node))
283 self.redis.zadd(self.score_prefix+score_name, score_value, str(node))
210 284
211 285 def calculate_advanced_scores(self): def calculate_advanced_scores(self):
212 286 for advanced_score in self.advanced_scores: for advanced_score in self.advanced_scores:
213 self.advanced_scores[advanced_score]()
287 self.advanced_scores[advanced_score](self)
214 288
215 289
216 290 ################################################### ###################################################
217 291 # actual metrics and corrections etc. below # actual metrics and corrections etc. below
218 292 # must return value which can be converted to float # must return value which can be converted to float
293 ###################################################
294 #
295 # def clustering_coefficient(self,node):
296 # #in the first run calculate the metric for all nodes at once and save in a hash of the instance to access later
297 # #NOTE: this should result in a performance gain, but for very large graphs this might be a problem.
298 # # in this case, just returning nx.clustering(self.graph, node) might be better
299 # if not hasattr(self, 'all_clustering_coefficients'):
300 # self.all_clustering_coefficients = nx.clustering(self.graph)
301 #
302 # #get the actual value from the pre-calculated hash
303 # return self.all_clustering_coefficients[node]
304 #
305 # def degree(self, node):
306 # return self.graph.degree(node)
307 #
308 #
309 # def average_neighbor_degree(self,node):
310 # # same caching technique as in self.clustering_coefficient
311 # # might also break for very large graphs
312 # # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go
313 #
314 # if not hasattr(self, 'all_average_neighbor_degrees'):
315 # self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
316 # return self.all_average_neighbor_degrees[node]
317 #
318 # def iterated_average_neighbor_degree(self, node):
319 #
320 # first_level_neighbors = self.graph.neighbors(node)
321 # second_level_neighbors = []
322 #
323 # # get all two-hop nodes
324 # for first_level_neighbor in first_level_neighbors:
325 # current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
326 # second_level_neighbors.extend(current_second_level_neighbors)
327 #
328 # #remove one-hop nodes and self
329 # relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
330 #
331 # degree_sum = 0
332 # for relevant_node in relevant_nodes:
333 # degree_sum += self.graph.degree(relevant_node)
334 #
335 # return float(degree_sum)/float(len(relevant_nodes))
336 #
337 # def betweenness_centrality(self, node):
338 # if not hasattr(self, 'all_betweenness_centralities'):
339 # self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
340 # return self.all_betweenness_centralities[node]
341 #
342 # def eccentricity(self, node):
343 # if not hasattr(self, 'all_eccentricities'):
344 # self.all_eccentricities = nx.eccentricity(self.graph)
345 # return self.all_eccentricities[node]
346 #
347 # def average_shortest_path_length(self, node):
348 # # caching average_shortest_path_length for all nodes at once failed
349 # # already switched to single calculation
350 #
351 # #get all shortest path lengths
352 # all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)
353 #
354 # #calculate average
355 # sum_of_lengths = 0
356 # for target in all_shortest_path_lengths_for_node:
357 # sum_of_lengths += all_shortest_path_lengths_for_node[target]
358 #
359 # return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)
360 #
361 #
362 ##############
363 ## corrections
364 ##############
365 # def correct_clustering_coefficient(self,node):
366 # clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
367 # degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
368 # corrected_cc = clustering_coefficient + (degree * clustering_coefficient) / float(4)
369 #
370 # return corrected_cc
371 #
372 # #def correct_clustering_coefficient(self):
373 #
374 # # for node in self.nodes:
375 # # clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
376 # # degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
377 #
378 # # corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
379 #
380 # # self.redis.hset(self.node_prefix+str(node), 'corrected_clustering_coefficient', corrected_cc)
381 # # self.redis.zadd('corrected_clustering_coefficient', corrected_cc, str(node))
382 #
383 # def correct_average_neighbor_degree(self,node):
384 # avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))
385 #
386 # neighbors = self.graph.neighbors(node)
387 # number_of_neighbors = float(len(neighbors))
388 # neighbor_degrees = []
389 # for neighbor in neighbors:
390 # neighbor_degrees.append(self.graph.degree(neighbor))
391 #
392 # #using numpy median and standard deviation implementation
393 # numpy_neighbor_degrees = np.array(neighbor_degrees)
394 # median = np.median(numpy_neighbor_degrees)
395 # standard_deviation = np.std(numpy_neighbor_degrees)
396 #
397 # if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
398 # return avgnd
399 # else:
400 # return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_neighbors ) * avgnd
401 #
402 #
403 # def correct_iterated_average_neighbor_degree(self, node):
404 # avgnd = float(self.redis.hget(self.node_prefix+str(node), 'iterated_average_neighbor_degree'))
405 #
406 # first_level_neighbors = self.graph.neighbors(node)
407 # second_level_neighbors = []
408 #
409 # # get all two-hop nodes
410 # for first_level_neighbor in first_level_neighbors:
411 # current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
412 # second_level_neighbors.extend(current_second_level_neighbors)
413 #
414 # #remove one-hop neighbors and self
415 # relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
416 #
417 # number_of_nodes = len(relevant_nodes)
418 # node_degrees = []
419 # for rel_node in relevant_nodes:
420 # node_degrees.append(self.graph.degree(rel_node))
421 #
422 # numpy_node_degrees = np.array(node_degrees)
423 # median = np.median(numpy_node_degrees)
424 # standard_deviation = np.std(numpy_node_degrees)
425 #
426 # if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
427 # return avgnd
428 # else:
429 # return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_nodes ) * avgnd
430 #
431 #
432 #
433 #
434 #################
435 ##advanced scores
436 #################
437 #
438 # def urs_clustering_coefficient_modification(self):
439 #
440 # #caching of values
441 # all_ccs_normalized = dict(self.redis.zrange('corrected_clustering_coefficient'+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float))
442 # all_urs = dict(self.redis.zrange('unified_risk_score', 0, -1, withscores=True, score_cast_func=float))
443 #
444 # urs_percentile_10 = np.percentile(all_urs.values(), 10)
445 # urs_percentile_90 = np.percentile(all_urs.values(), 90)
446 #
447 # for node in self.nodes:
448 # #cc_normalized = float(self.redis.hget(self.node_prefix+str(node),'corrected_clustering_coefficient'+self.normalization_suffix))
449 # #urs = float(self.redis.hget(self.node_prefix+str(node),'unified_risk_score'))
450 #
451 # cc_normalized = all_ccs_normalized[str(node)]
452 # urs = all_urs[str(node)]
453 #
454 #
455 # if (urs >= urs_percentile_90 or urs <= urs_percentile_10):
456 # if (cc_normalized >= 0.25):
457 # advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
458 # else:
459 # advanced_unified_risk_score = urs
460 # else:
461 # advanced_unified_risk_score = urs
462 #
463 # #save for node
464 # self.redis.hset(self.node_prefix+str(node), 'advanced_unified_risk_score', advanced_unified_risk_score)
465 # #save for metric
466 # self.redis.zadd('advanced_unified_risk_score', advanced_unified_risk_score, str(node))
467
468 #############
469 # statistics
470 #############
471
472 def calculate_statistics(self):
473 for metric in self.base_metrics:
474 #absolute and normalized
475 statistics.calculate_statistics(self, metric, self.metric_prefix+metric)
476 statistics.calculate_statistics(self, metric+self.normalization_suffix, self.metric_prefix+metric+self.normalization_suffix)
219 477
220 def clustering_coefficient(self,node):
221 #in the first run calculate the metric for all nodes at once and save in a hash of the instance to access later
222 #NOTE: this should result in a performance gain, but for very large graphs this might be a problem.
223 # in this case, just returning nx.clustering(self.graph, node) might be better
224 if not hasattr(self, 'all_clustering_coefficients'):
225 self.all_clustering_coefficients = nx.clustering(self.graph)
226
227 #get the actual value from the pre-calculated hash
228 return self.all_clustering_coefficients[node]
229
230 def degree(self, node):
231 return self.graph.degree(node)
232
233
234 def average_neighbor_degree(self,node):
235 # same caching technique as in self.clustering_coefficient
236 # might also break for very large graphs
237 # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go
238
239 if not hasattr(self, 'all_average_neighbor_degrees'):
240 self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
241 return self.all_average_neighbor_degrees[node]
242
243 def iterated_average_neighbor_degree(self, node):
244
245 first_level_neighbors = self.graph.neighbors(node)
246 second_level_neighbors = []
247
248 # get all two-hop nodes
249 for first_level_neighbor in first_level_neighbors:
250 current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
251 second_level_neighbors.extend(current_second_level_neighbors)
252
253 #remove one-hop nodes and self
254 relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
255
256 degree_sum = 0
257 for relevant_node in relevant_nodes:
258 degree_sum += self.graph.degree(relevant_node)
259
260 return float(degree_sum)/float(len(relevant_nodes))
261
262 def betweenness_centrality(self, node):
263 if not hasattr(self, 'all_betweenness_centralities'):
264 self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
265 return self.all_betweenness_centralities[node]
266
267 def eccentricity(self, node):
268 if not hasattr(self, 'all_eccentricities'):
269 self.all_eccentricities = nx.eccentricity(self.graph)
270 return self.all_eccentricities[node]
271
272 def average_shortest_path_length(self, node):
273 # caching average_shortest_path_length for all nodes at once failed
274 # already switched to single calculation
275
276 #get all shortest path lengths
277 all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)
278
279 #calculate average
280 sum_of_lengths = 0
281 for target in all_shortest_path_lengths_for_node:
282 sum_of_lengths += all_shortest_path_lengths_for_node[target]
283
284 return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)
285
286
287 #############
288 # corrections
289 #############
290 def correct_clustering_coefficient(self,node):
291 clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
292 degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
293 corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
294
295 return corrected_cc
296
297 #def correct_clustering_coefficient(self):
298
299 # for node in self.nodes:
300 # clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node),'clustering_coefficient'))
301 # degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
302
303 # corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
304
305 # self.redis.hset(self.node_prefix+str(node), 'corrected_clustering_coefficient', corrected_cc)
306 # self.redis.zadd('corrected_clustering_coefficient', corrected_cc, str(node))
307
308 def correct_average_neighbor_degree(self,node):
309 avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))
310
311 neighbors = self.graph.neighbors(node)
312 number_of_neighbors = float(len(neighbors))
313 neighbor_degrees = []
314 for neighbor in neighbors:
315 neighbor_degrees.append(self.graph.degree(neighbor))
316
317 #using numpy median and standard deviation implementation
318 numpy_neighbor_degrees = np.array(neighbor_degrees)
319 median = np.median(numpy_neighbor_degrees)
320 standard_deviation = np.std(numpy_neighbor_degrees)
321
322 if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
323 return avgnd
324 else:
325 return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_neighbors ) * avgnd
326
327 #return 18
328
329 def correct_iterated_average_neighbor_degree(self, node):
330 avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))
331
332 first_level_neighbors = self.graph.neighbors(node)
333 second_level_neighbors = []
334
335 # get all two-hop nodes
336 for first_level_neighbor in first_level_neighbors:
337 current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
338 second_level_neighbors.extend(current_second_level_neighbors)
339
340 #remove one-hop neighbors and self
341 relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])
342
343 number_of_nodes = len(relevant_nodes)
344 node_degrees = []
345 for rel_node in relevant_nodes:
346 node_degrees.append(self.graph.degree(rel_node))
347
348 numpy_node_degrees = np.array(node_degrees)
349 median = np.median(numpy_node_degrees)
350 standard_deviation = np.std(numpy_node_degrees)
351
352 if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
353 return avgnd
354 else:
355 return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_nodes ) * avgnd
356
357
358
359
360 ################
361 #advanced scores
362 ################
363
364 def urs_clustering_coefficient_modification(self):
365
366 #caching of values
367 all_ccs_normalized = dict(self.redis.zrange('corrected_clustering_coefficient'+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float))
368 all_urs = dict(self.redis.zrange('unified_risk_score', 0, -1, withscores=True, score_cast_func=float))
369
370 urs_percentile_10 = np.percentile(all_urs.values(), 10)
371 urs_percentile_90 = np.percentile(all_urs.values(), 90)
372
373 for node in self.nodes:
374 #cc_normalized = float(self.redis.hget(self.node_prefix+str(node),'corrected_clustering_coefficient'+self.normalization_suffix))
375 #urs = float(self.redis.hget(self.node_prefix+str(node),'unified_risk_score'))
376
377 cc_normalized = all_ccs_normalized[str(node)]
378 urs = all_urs[str(node)]
379
380
381
382 if (urs >= urs_percentile_90 or urs <= urs_percentile_10):
383 if (cc_normalized >= 0.25):
384 advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
385 else:
386 advanced_unified_risk_score = urs
387 else:
388 advanced_unified_risk_score = urs
478 for advanced_metric in self.advanced_metrics:
479 #absolute and normalized
480 statistics.calculate_statistics(self, advanced_metric, self.metric_prefix+advanced_metric)
481 statistics.calculate_statistics(self, advanced_metric+self.normalization_suffix, self.metric_prefix+advanced_metric+self.normalization_suffix)
389 482
390 #save for node
391 self.redis.hset(self.node_prefix+str(node), 'advanced_unified_risk_score', advanced_unified_risk_score)
392 #save for metric
393 self.redis.zadd('advanced_unified_risk_score', advanced_unified_risk_score, str(node))
483 for score in self.scores:
484 statistics.calculate_statistics(self, score, self.score_prefix+score)
394 485
395
486 for advanced_score in self.advanced_scores:
487 statistics.calculate_statistics(self, advanced_score, self.score_prefix+advanced_score)
488
489 statistics.calculate_correlations(self)
490
491 #
492 #
493 # def calculate_statistics_for_absolute_values(self,metric):
494 # all_values = dict(self.redis.zrange(metric, 0, -1, withscores=True, score_cast_func=float)).values()
495 # min_value = np.min(np.array(all_values))
496 # max_value = np.max(all_values)
497 #
498 # average = np.average(all_values)
499 # median = np.median(all_values)
500 # standard_deviation = np.std(all_values)
501 #
502 # self.redis.hset(self.statistics_prefix+str(metric), 'min', min_value)
503 # self.redis.hset(self.statistics_prefix+str(metric), 'max', max_value)
504 # self.redis.hset(self.statistics_prefix+str(metric), 'average', average)
505 # self.redis.hset(self.statistics_prefix+str(metric), 'median', median)
506 # self.redis.hset(self.statistics_prefix+str(metric), 'standard_deviation', standard_deviation)
507 #
508 # def calculate_statistics_for_normalized_values(self,metric):
509 # all_values = dict(self.redis.zrange(metric+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float)).values()
510 #
511 # min_value = np.min(all_values)
512 # max_value = np.max(all_values)
513 #
514 # average = np.average(all_values)
515 # median = np.median(all_values)
516 # standard_deviation = np.std(all_values)
517 #
518 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'min', min_value)
519 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'max', max_value)
520 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'average', average)
521 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'median', median)
522 # self.redis.hset(self.statistics_prefix+str(metric)+self.normalization_suffix, 'standard_deviation', standard_deviation)
523 #
524 #
525 # def calculate_correlations(self):
526 # m = self.metrics.keys()
527 # c = self.corrections.keys()
528 #
529 # metrics = m + c
530 #
531 # correlations = {}
532 # for metric1 in metrics:
533 # correlations[metric1] = {}
534 # for metric2 in metrics:
535 # correlations[metric1][metric2] = (0,0)
536 # if metric1 == metric2:
537 # correlations[metric1][metric2] = (1,0)
538 # continue
539 #
540 # dict_metric1 = dict(self.redis.zrange(metric1, 0, -1, withscores=True, score_cast_func=float))
541 # dict_metric2 = dict(self.redis.zrange(metric2, 0, -1, withscores=True, score_cast_func=float))
542 # values_metric1 = []
543 # values_metric2 = []
544 #
545 # for key in sorted(dict_metric1.iterkeys()):
546 # values_metric1.append(dict_metric1[key])
547 #
548 # for key in sorted(dict_metric2.iterkeys()):
549 # values_metric2.append(dict_metric2[key])
550 #
551 # correlations[metric1][metric2] = pearsonr(values_metric1,values_metric2)
552 #
553 # values_metric1 = []
554 # values_metric2 = []
555 #
556 # for source in correlations:
557 # for target in correlations[source]:
558 # self.redis.hset("correlations:"+source+":"+target, "correlation", correlations[source][target][0])
559 # self.redis.hset("correlations:"+source+":"+target, "confidence", correlations[source][target][1])
File metric_calculator.pyc deleted (index ff74136..0000000)
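The index helpers only survive as the commented-out in-class versions above, so this is a sketch of how the new indexing module presumably exposes them as module-level functions (with self.metrics renamed to base_metrics, matching the indexing.index_*(self) calls in create_indexes()):

# indexing.py -- sketch, lifted from the commented-out in-class index helpers
def index_nodes(calculator):
    calculator.redis.sadd(calculator.node_index_key, *calculator.nodes)

def index_neighbors(calculator):
    for node in calculator.nodes:
        node_neighbors = calculator.graph.neighbors(int(node))
        calculator.redis.sadd(calculator.node_neighbors_prefix + str(node), *node_neighbors)

def index_metrics(calculator):
    for metric in calculator.base_metrics:
        calculator.redis.sadd(calculator.metric_index_key, metric)
    for advanced_metric in calculator.advanced_metrics:
        calculator.redis.sadd(calculator.metric_index_key, advanced_metric)

def index_scores(calculator):
    for score in calculator.scores:
        calculator.redis.sadd(calculator.score_index_key, score)
    for advanced_score in calculator.advanced_scores:
        calculator.redis.sadd(calculator.score_index_key, advanced_score)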
File profiling.py deleted (index 798f46e..0000000)
1 from metric_calculator import MetricCalculator
2 import networkx as nx
3 import redis as rd
4
5 import cProfile, pstats, StringIO
6
7 redis = rd.StrictRedis(host='localhost', port=6379, db=0)
8
9 #random_runs = [[100,0.2],[100,0.3]]
10 random_runs = [[1000,0.05],[1000,0.1],[1000,0.2],[10000,0.3],[1000,0.4],[2000,0.2],[3000,0.2],[4000,0.2],[5000,0.2],[6000,0.2]]
11
12
13 for graph_configuration in random_runs:
14
15 number_of_nodes = graph_configuration[0]
16 probability_of_connection = graph_configuration[1]
17
18 graph = nx.fast_gnp_random_graph(number_of_nodes,probability_of_connection,seed=1)
19
20 nodes = nx.nodes(graph)
21 #barabasi_albert_graph(n, m, seed=None)[source]
22
23 if not nx.is_connected(graph):
24 print "not connected"
25 sys.exit(-1)
26
27 redis.flushdb()
28 redis.sadd('all_nodes', *nodes)
29
30 mc = MetricCalculator(graph)
31
32 pr = cProfile.Profile()
33 pr.enable()
34
35 mc.start()
36
37 s = StringIO.StringIO()
38 sortby = 'cumulative'
39 ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
40 ps.print_stats()
41
42 outfile = open('auto_profiling_output_'+str(number_of_nodes)+'_'+str(probability_of_connection)+'.txt', 'w')
43 outfile.write(s.getvalue())
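Likewise, normalizations.min_max is not shown in this commit; the sketch below adapts the removed min_max_normalization method to the new module interface. Reading and writing the sorted sets under metric_prefix is an assumption based on how calculate_metrics() and calculate_statistics() now address them:

# normalizations.py -- sketch of min_max, adapted from the removed
# MetricCalculator.min_max_normalization; the metric_prefix key handling is an assumption
def min_max(calculator, metric_name):
    redis_key = calculator.metric_prefix + metric_name

    # smallest and largest absolute value of the metric
    x_min = calculator.redis.zrange(redis_key, 0, 0, withscores=True, score_cast_func=float)[0][1]
    x_max = calculator.redis.zrange(redis_key, -1, -1, withscores=True, score_cast_func=float)[0][1]

    for node in calculator.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(calculator.redis.hget(calculator.node_prefix + str(node), metric_name))
            x_normalized = (x - x_min) / (x_max - x_min)

        # store the normalized value in the metric's sorted set and in the node's hash
        calculator.redis.zadd(redis_key + calculator.normalization_suffix, x_normalized, str(node))
        calculator.redis.hset(calculator.node_prefix + str(node),
                              metric_name + calculator.normalization_suffix, x_normalized)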
File run.py deleted (index 682220f..0000000)
1 #!/usr/bin/env python
2
3 from file_importer import FileImporter
4 from metric_calculator import MetricCalculator
5
6 import cProfile, pstats, StringIO
7
8 import networkx as nx
9 import redis as rd
10
11 # start import
12 #fi = FileImporter('data/Dataset_2012.txt')
13 #fi = FileImporter('data/test_dataset.txt')
14 #graph = fi.read()
15
16 #print "Nodes:"
17 #print graph.number_of_nodes()
18 #print "Edges:"
19 #print graph.number_of_edges()
20
21 redis = rd.StrictRedis(host='localhost', port=6379, db=0)
22 redis.flushdb()
23 all_nodes = range(1,100)
24 graph = nx.fast_gnp_random_graph(100,0.15,seed=1)
25 redis.sadd('all_nodes', *all_nodes)
26
27 mc = MetricCalculator(graph)
28
29 pr = cProfile.Profile()
30 pr.enable()
31
32 mc.start()
33
34 s = StringIO.StringIO()
35 sortby = 'cumulative'
36 ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
37 ps.print_stats()
38
39 outfile = open('profiling_run_result.txt', 'w')
40 outfile.write(s.getvalue())
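run.py is removed here and its replacement is not part of the diff. Under the new layout a minimal driver would presumably need only the two lines below, since start() now flushes Redis and builds the node index itself instead of relying on the caller (the data file name is taken from the old comments and is just an example):

#!/usr/bin/env python
# sketch of a minimal driver for the refactored modules; the real entry point
# replacing run.py is not part of this commit
from file_importer import FileImporter
from metric_calculator import MetricCalculator

graph = FileImporter('data/test_dataset.txt').read()  # example data file
MetricCalculator(graph).start()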
File test.py deleted (index 853e9c1..0000000)
1 #redis test
2 import redis
3 r = redis.StrictRedis(host='localhost', port=6379, db=0)
4
5 nodes = [1,2,3,4,5,6,7,8,9]
6 for node in nodes:
7 print str(node)
8 print r.get('node:'+str(node)+':degree')
9 print r.get('node:'+str(node)+':average_neighbor_degree')
10 print r.get('node:'+str(node)+':eccentricity')
11 print r.get('node:'+str(node)+':betweenness_centrality')
12 print r.get('node:'+str(node)+':clustering_coefficient')
13 print r.get('node:'+str(node)+':average_shortest_path_length')
14
15
16
17 print r.get('all_nodes').strip('[]').split(', ').type()
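Finally, the new statistics module is also absent from the diff. Judging from the calculate_statistics(self, metric, redis_key) calls and the two commented-out in-class variants, a sketch could look like this; calculate_correlations(calculator) would additionally compute the pairwise Pearson correlations shown in the commented block:

# statistics.py -- sketch of calculate_statistics, merged from the two commented-out
# in-class variants; the (calculator, metric_name, redis_key) signature is an assumption
# mirroring the calls in MetricCalculator.calculate_statistics()
import numpy as np

def calculate_statistics(calculator, metric_name, redis_key):
    # all stored values for this metric (absolute or normalized, depending on redis_key)
    all_values = dict(calculator.redis.zrange(redis_key, 0, -1,
                                              withscores=True, score_cast_func=float)).values()

    stats_key = calculator.statistics_prefix + str(metric_name)
    calculator.redis.hset(stats_key, 'min', np.min(all_values))
    calculator.redis.hset(stats_key, 'max', np.max(all_values))
    calculator.redis.hset(stats_key, 'average', np.average(all_values))
    calculator.redis.hset(stats_key, 'median', np.median(all_values))
    calculator.redis.hset(stats_key, 'standard_deviation', np.std(all_values))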