File metric_calculator.py added (mode: 100644) (index 0000000..a573c4e)
import networkx as nx
import redis as rd
import numpy as np


class MetricCalculator(object):
    def __init__(self, graph):
        self.graph = graph
        self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
        self.nodes = nx.nodes(graph)

        self.node_neighbors_prefix = 'node_neighbors:'
        self.node_prefix = 'node_metrics:'
        self.normalization_suffix = '_normalized'

        # definition of all base metrics for which absolute values will be calculated for each node in the first step
        # key is the name of the metric and value is the implemented method which exposes the required interface
        # interface: each method takes the node as the single parameter, performs the necessary calculation
        # and returns a float containing the value for the specified node

        self.metrics = {'clustering_coefficient':              self.clustering_coefficient,
                        'degree':                               self.degree,
                        'average_neighbor_degree':              self.average_neighbor_degree,
                        'iterated_average_neighbor_degree':     self.iterated_average_neighbor_degree,
                        'betweenness_centrality':               self.betweenness_centrality,
                        'eccentricity':                         self.eccentricity,
                        'average_shortest_path_length':         self.average_shortest_path_length}

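        # as a sketch of this interface (purely illustrative, not part of the calculator):
        # an additional metric such as closeness centrality could be registered as
        #     'closeness_centrality': self.closeness_centrality
        # and backed by a method like
        #     def closeness_centrality(self, node):
        #         return nx.closeness_centrality(self.graph, node)
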
        # for the frontend
        # self.metric_names = {'clustering_coefficient':              'Clustering Coefficient',
        #                      'degree':                              'Node Degree',
        #                      'average_neighbor_degree':             'Average Neighbor Node Degree',
        #                      'iterated_average_neighbor_degree':    'Iterated Average Neighbor Node Degree',
        #                      'betweenness_centrality':              'Betweenness Centrality',
        #                      'eccentricity':                        'Node Eccentricity',
        #                      'average_shortest_path_length':        'Average Shortest Path Length'}

        # some metrics might require corrections or post-processing which rely on the values of other metrics or on normalizations
        # key is the metric name and value is the method for the correction

        self.corrections = {'corrected_clustering_coefficient':               self.correct_clustering_coefficient,
                            'corrected_average_neighbor_degree':               self.correct_average_neighbor_degree,
                            'corrected_iterated_average_neighbor_degree':      self.correct_iterated_average_neighbor_degree}

        # for every metric, a normalization method has to be specified
        # key is the name of the metric and value is the normalization method, which also has to expose the required interface
        # interface: normalization methods take the name of the (absolute) metric as the single argument; no return value is required
        # the method itself shall read the data required for the normalization from the redis instance,
        # using the corresponding keys/values for the specified metric
        # it shall then loop over all nodes and calculate the normalized value for each node and the metric
        # afterwards it should save the result to redis using "metric_name_normalized" as the key
        # the result is also stored inside the node's hash for metrics

        # corrected metrics also need to be included here under their respective names
        self.normalization_methods = {'clustering_coefficient':                        self.min_max_normalization,
                                      'corrected_clustering_coefficient':              self.min_max_normalization,
                                      'degree':                                        self.min_max_normalization,
                                      'average_neighbor_degree':                       self.min_max_normalization,
                                      'corrected_average_neighbor_degree':             self.min_max_normalization,
                                      'iterated_average_neighbor_degree':              self.min_max_normalization,
                                      'corrected_iterated_average_neighbor_degree':    self.min_max_normalization,
                                      'betweenness_centrality':                        self.min_max_normalization,
                                      'eccentricity':                                  self.inverse_min_max_normalization,
                                      'average_shortest_path_length':                  self.inverse_min_max_normalization}

        # the easiest case for a score is a combination of normalized metric values with weights which add up to 1
        # such scores can easily be defined here

        # self.scores = ['unified_risk_score']

        self.scores = {'unified_risk_score': {  #'corrected_clustering_coefficient': 0.2,
                           'degree_normalized':                                      0.25,
                           'corrected_average_neighbor_degree_normalized':           0.15,
                           'corrected_iterated_average_neighbor_degree_normalized':  0.1,
                           'betweenness_centrality_normalized':                      0.25,
                           'eccentricity_normalized':                                0.125,
                           'average_shortest_path_length_normalized':                0.125}
                       }

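        # as a sketch of how such a definition is applied: for a single node the unified risk score is
        #     0.25*degree + 0.15*avg_neighbor_degree + 0.1*iterated_avg_neighbor_degree
        #     + 0.25*betweenness_centrality + 0.125*eccentricity + 0.125*avg_shortest_path_length
        # with every value taken from its *_normalized counterpart; the weights sum to 1.0, so the
        # score stays within [0, 1] as long as every normalized input does
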
        # other scores might require a more sophisticated algorithm to be calculated
        # such scores need to be added here and implemented like the example below

        self.advanced_scores = {'advanced_unified_risk_score': self.urs_clustering_coefficient_modification}

    def start(self):
        self.store_neighbors()
        self.calculate_metrics()
        self.calculate_corrections()
        self.normalize_metrics()
        self.calculate_scores()
        self.calculate_advanced_scores()

    # write the list of neighbors of each node to redis for navigation purposes in the frontend
    def store_neighbors(self):
        for node in self.nodes:
            node_neighbors = self.graph.neighbors(int(node))
            self.redis.sadd(self.node_neighbors_prefix+str(node), *node_neighbors)

    # loop through all defined metrics and call the specified calculation method for each node
    def calculate_metrics(self):
        for metric_name in self.metrics:
            metric_method = self.metrics[metric_name]

            # loop through all nodes
            for node in self.nodes:
                # call the calculation method of the supplied metric for the current node
                node = int(node)
                value = float(metric_method(node))

                # store the result in the node's hash of values
                self.redis.hset(self.node_prefix+str(node), metric_name, value)

                # also store the result in the metric's sorted set
                self.redis.zadd(metric_name, value, str(node))

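    # note on the redis layout used above and in the methods below: every node gets a hash
    # 'node_metrics:<id>' holding all of its values, and every metric/score gets a sorted set
    # keyed by its name and ordered by value; the zadd calls pass the score before the member,
    # which assumes the redis-py 2.x calling convention -- redis-py 3.x+ would instead expect
    # a mapping, e.g. self.redis.zadd(metric_name, {str(node): value})
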
    # loop through all defined corrections and call the specified calculation method
    def calculate_corrections(self):
        for correction_name in self.corrections:
            correction_method = self.corrections[correction_name]
            for node in self.nodes:
                node = int(node)
                value = float(correction_method(node))

                # store the result in the node's hash of values
                self.redis.hset(self.node_prefix+str(node), correction_name, value)

                # also store the result in the correction's sorted set
                self.redis.zadd(correction_name, value, str(node))

    # loop through all defined normalizations and call the respective normalization method
    # there is no default normalization for metrics not listed in the "normalization_methods" hash
    def normalize_metrics(self):
        for metric_name in self.normalization_methods:
            normalization_method = self.normalization_methods[metric_name]
            normalization_method(metric_name)

    # normalizations

    # min-max normalization: normalize the specified metric for all nodes
    def min_max_normalization(self, metric_name):
        # get min and max from the metric's sorted set in redis
        x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
        x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

        for node in self.nodes:
            if x_min == x_max:
                x_normalized = 1.0
            else:
                x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
                x_normalized = (x - x_min) / (x_max - x_min)

            # store the value for the node and the metric
            self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
            self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)

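    # worked example for the normalization above: with x_min = 2.0 and x_max = 10.0 a raw
    # value x = 8.0 maps to (8.0 - 2.0) / (10.0 - 2.0) = 0.75; the smallest raw value always
    # maps to 0.0 and the largest to 1.0
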
    # inverse min-max normalization: high raw values map to low normalized values
    def inverse_min_max_normalization(self, metric_name):
        x_min = self.redis.zrange(metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
        x_max = self.redis.zrange(metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

        for node in self.nodes:
            if x_min == x_max:
                x_normalized = 1.0
            else:
                x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
                x_normalized = (x_max - x) / (x_max - x_min)

            # store the value for the node and the metric
            self.redis.zadd(metric_name+self.normalization_suffix, x_normalized, str(node))
            self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)

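    # worked example for the inverse variant: with x_min = 2.0 and x_max = 10.0 a raw value
    # x = 8.0 maps to (10.0 - 8.0) / (10.0 - 2.0) = 0.25, so the largest raw value maps to 0.0
    # and the smallest to 1.0
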
    # calculate all scores that are defined as weighted combinations of normalized metrics
    def calculate_scores(self):
        for score_name in self.scores:
            metrics_with_weights = self.scores[score_name]

            for node in self.nodes:
                score_value = 0.0

                for metric in metrics_with_weights:
                    weight = self.scores[score_name][metric]
                    value = float(self.redis.hget(self.node_prefix+str(node), metric))
                    score_value += weight * value

                self.redis.hset(self.node_prefix+str(node), score_name, score_value)
                self.redis.zadd(score_name, score_value, str(node))

    # calculate all scores that are implemented by a dedicated method
    def calculate_advanced_scores(self):
        for advanced_score in self.advanced_scores:
            self.advanced_scores[advanced_score]()

    ###################################################
    # actual metrics, corrections etc. below
    # each must return a value which can be converted to float
    ###################################################

    def clustering_coefficient(self, node):
        # on the first call, calculate the metric for all nodes at once and cache the result
        # in an attribute of the instance for later access
        # NOTE: this should result in a performance gain, but for very large graphs it might be a problem;
        # in that case, just returning nx.clustering(self.graph, node) might be better
        if not hasattr(self, 'all_clustering_coefficients'):
            self.all_clustering_coefficients = nx.clustering(self.graph)

        # get the actual value from the pre-calculated hash
        return self.all_clustering_coefficients[node]

    def degree(self, node):
        return self.graph.degree(node)

    def average_neighbor_degree(self, node):
        # same caching technique as in self.clustering_coefficient
        # might also break for very large graphs
        # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go

        if not hasattr(self, 'all_average_neighbor_degrees'):
            self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
        return self.all_average_neighbor_degrees[node]

    def iterated_average_neighbor_degree(self, node):
        first_level_neighbors = self.graph.neighbors(node)
        second_level_neighbors = []

        # get all two-hop nodes
        for first_level_neighbor in first_level_neighbors:
            current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
            second_level_neighbors.extend(current_second_level_neighbors)

        # remove one-hop nodes and the node itself
        relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

        degree_sum = 0
        for relevant_node in relevant_nodes:
            degree_sum += self.graph.degree(relevant_node)

        # NOTE: a node without any two-hop neighbors leaves relevant_nodes empty and raises a ZeroDivisionError here
        return float(degree_sum)/float(len(relevant_nodes))

    def betweenness_centrality(self, node):
        if not hasattr(self, 'all_betweenness_centralities'):
            self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
        return self.all_betweenness_centralities[node]

    def eccentricity(self, node):
        if not hasattr(self, 'all_eccentricities'):
            self.all_eccentricities = nx.eccentricity(self.graph)
        return self.all_eccentricities[node]

    def average_shortest_path_length(self, node):
        # caching average_shortest_path_length for all nodes at once failed,
        # so this already uses a per-node calculation

        # get all shortest path lengths from this node
        all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)

        # calculate the average
        sum_of_lengths = 0
        for target in all_shortest_path_lengths_for_node:
            sum_of_lengths += all_shortest_path_lengths_for_node[target]

        return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)

    #############
    # corrections
    #############

    def correct_clustering_coefficient(self, node):
        clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node), 'clustering_coefficient'))
        degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
        corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)

        return corrected_cc

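    # worked example for the correction above: with clustering_coefficient = 0.5 and degree = 4
    # the corrected value is 0.5 * (4 * 0.5) / 4.0 = 0.25
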
    # earlier variant of the correction which looped over all nodes itself:
    # def correct_clustering_coefficient(self):
    #     for node in self.nodes:
    #         clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node), 'clustering_coefficient'))
    #         degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
    #
    #         corrected_cc = clustering_coefficient * (degree * clustering_coefficient) / float(4)
    #
    #         self.redis.hset(self.node_prefix+str(node), 'corrected_clustering_coefficient', corrected_cc)
    #         self.redis.zadd('corrected_clustering_coefficient', corrected_cc, str(node))

    def correct_average_neighbor_degree(self, node):
        avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))

        neighbors = self.graph.neighbors(node)
        number_of_neighbors = float(len(neighbors))
        neighbor_degrees = []
        for neighbor in neighbors:
            neighbor_degrees.append(self.graph.degree(neighbor))

        # using numpy's median and standard deviation implementations
        numpy_neighbor_degrees = np.array(neighbor_degrees)
        median = np.median(numpy_neighbor_degrees)
        standard_deviation = np.std(numpy_neighbor_degrees)

        if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
            return avgnd
        else:
            return avgnd + (((median - avgnd) / standard_deviation) / number_of_neighbors) * avgnd

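    # worked example for the correction term above: with avgnd = 4.0, median = 6.0,
    # standard_deviation = 2.0 and number_of_neighbors = 5.0 the result is
    # 4.0 + (((6.0 - 4.0) / 2.0) / 5.0) * 4.0 = 4.8
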
    def correct_iterated_average_neighbor_degree(self, node):
        # NOTE: this reads the plain 'average_neighbor_degree'; given the method name,
        # 'iterated_average_neighbor_degree' may have been intended here
        avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))

        first_level_neighbors = self.graph.neighbors(node)
        second_level_neighbors = []

        # get all two-hop nodes
        for first_level_neighbor in first_level_neighbors:
            current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
            second_level_neighbors.extend(current_second_level_neighbors)

        # remove one-hop neighbors and the node itself
        relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

        number_of_nodes = len(relevant_nodes)
        node_degrees = []
        for rel_node in relevant_nodes:
            node_degrees.append(self.graph.degree(rel_node))

        numpy_node_degrees = np.array(node_degrees)
        median = np.median(numpy_node_degrees)
        standard_deviation = np.std(numpy_node_degrees)

        if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
            return avgnd
        else:
            return avgnd + (((median - avgnd) / standard_deviation) / number_of_nodes) * avgnd

    ################
    # advanced scores
    ################

    def urs_clustering_coefficient_modification(self):
        # caching of values: read the full sorted sets once instead of issuing one hget per node
        all_ccs_normalized = dict(self.redis.zrange('corrected_clustering_coefficient'+self.normalization_suffix, 0, -1, withscores=True, score_cast_func=float))
        all_urs = dict(self.redis.zrange('unified_risk_score', 0, -1, withscores=True, score_cast_func=float))

        urs_percentile_10 = np.percentile(all_urs.values(), 10)
        urs_percentile_90 = np.percentile(all_urs.values(), 90)

        for node in self.nodes:
            # cc_normalized = float(self.redis.hget(self.node_prefix+str(node), 'corrected_clustering_coefficient'+self.normalization_suffix))
            # urs = float(self.redis.hget(self.node_prefix+str(node), 'unified_risk_score'))

            cc_normalized = all_ccs_normalized[str(node)]
            urs = all_urs[str(node)]

            # only modify the score for nodes in the top or bottom decile of the unified risk score
            if (urs >= urs_percentile_90 or urs <= urs_percentile_10):
                if (cc_normalized >= 0.25):
                    advanced_unified_risk_score = ((urs * 3.0) + cc_normalized) / 4.0
                else:
                    advanced_unified_risk_score = urs
            else:
                advanced_unified_risk_score = urs

            # save for node
            self.redis.hset(self.node_prefix+str(node), 'advanced_unified_risk_score', advanced_unified_risk_score)
            # save for metric
            self.redis.zadd('advanced_unified_risk_score', advanced_unified_risk_score, str(node))
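
# minimal usage sketch (an assumption, not part of the module): this presumes a redis server
# reachable on localhost:6379, a redis-py 2.x client (see the zadd note above), a Python 2 era
# environment with a NetworkX 1.x graph whose neighbors() returns lists, and integer node labels
#
#   import networkx as nx
#   from metric_calculator import MetricCalculator
#
#   graph = nx.barabasi_albert_graph(100, 2)   # any undirected graph with integer nodes
#   calculator = MetricCalculator(graph)
#   calculator.start()
#
#   # afterwards the results can be read back from redis, e.g.:
#   #   calculator.redis.hgetall('node_metrics:0')
#   #   calculator.redis.zrange('advanced_unified_risk_score', 0, -1, withscores=True)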