File config.py added (mode: 100644) (index 0000000..8410e0b)

#config.py
import metrics
import normalizations
import advancedscores

#redis keys for indexes and values
node_index_key = 'all_nodes'
metric_index_key = 'all_metrics'
score_index_key = 'all_scores'

node_neighbors_prefix = 'node_neighbors:'
node_prefix = 'node_metrics:'
metric_prefix = 'metric:'
score_prefix = 'score:'
statistics_prefix = 'statistics:'

normalization_suffix = '_normalized'

# definition of all base metrics for which absolute values will be calculated for each node in the first step
# key is the name of the metric and value is the implemented method which exposes the required interface
# interface: each method takes the calculator instance and the node as parameters, performs the necessary
# calculation and returns a float containing the value for the specified node

base_metrics = {'clustering_coefficient'            : metrics.clustering_coefficient,
                'degree'                             : metrics.degree,
                'average_neighbor_degree'            : metrics.average_neighbor_degree,
                'iterated_average_neighbor_degree'   : metrics.iterated_average_neighbor_degree,
#               'betweenness_centrality'             : metrics.betweenness_centrality,
                'betweenness_centrality_gt'          : metrics.betweenness_centrality_gt,
#               'eccentricity'                       : metrics.eccentricity,
                'average_shortest_path_length'       : metrics.average_shortest_path_length
               }


# some metrics might require corrections or post-processing which relies on the values of other metrics or normalizations
# key is the metric name and value the method performing the correction

advanced_metrics = {'corrected_clustering_coefficient'           : metrics.correct_clustering_coefficient,
                    'corrected_average_neighbor_degree'          : metrics.correct_average_neighbor_degree,
                    'corrected_iterated_average_neighbor_degree' : metrics.correct_iterated_average_neighbor_degree}


# for every metric, a normalization method has to be specified
# key is the name of the metric and value is the normalization method which also has to expose the required interface
# interface: normalization methods take the calculator instance and the name of the (absolute) metric, no return value is required
# the method itself shall read the data required for normalization from the redis instance,
# using the corresponding keys/values for the specified metric
# it shall then loop over all nodes and calculate the normalized value for the node and the metric
# afterwards it should save the result to redis using "metric_name_normalized" as the key
# the result is stored inside the node's hash for metrics

# corrected metrics also need to be listed here under their respective names
normalization_methods = {'clustering_coefficient'                     : normalizations.min_max,
                         'corrected_clustering_coefficient'           : normalizations.min_max,
                         'degree'                                     : normalizations.min_max,
                         'average_neighbor_degree'                    : normalizations.min_max,
                         'corrected_average_neighbor_degree'          : normalizations.min_max,
                         'iterated_average_neighbor_degree'           : normalizations.min_max,
                         'corrected_iterated_average_neighbor_degree' : normalizations.min_max,
#                        'betweenness_centrality'                     : normalizations.min_max,
                         'betweenness_centrality_gt'                  : normalizations.min_max,
#                        'eccentricity'                               : normalizations.max_min,
                         'average_shortest_path_length'               : normalizations.max_min
                        }


# the simplest case for a score is a weighted combination of normalized metric values, with weights that add up to 1
# such scores can easily be defined here
# note: the metric names here refer to redis keys (normalized values), not to methods

scores = {'unified_risk_score': {'degree'                                     : 0.25,
                                 'corrected_average_neighbor_degree'          : 0.15,
                                 'corrected_iterated_average_neighbor_degree' : 0.1,
                                 'betweenness_centrality_gt'                  : 0.25,
#                                'eccentricity'                               : 0.125,
                                 'average_shortest_path_length'               : 0.25}
          }


# other scores might require a more sophisticated algorithm to be calculated
# such scores need to be added here and implemented like the example below

advanced_scores = {'advanced_unified_risk_score': advancedscores.adv_unified_risk_score}
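
The interface described in the comments of config.py can be illustrated with a small sketch. The metric name two_hop_count and the function below are hypothetical and not part of this commit; they only show the shape a further entry in base_metrics would take: a module-level function that receives the calculator instance and a node id and returns a float.

# hypothetical example, not part of this commit: an additional base metric
def two_hop_count(self, node):
    # count the distinct nodes exactly two hops away from 'node'
    neighbors = set(self.graph.neighbors(node))
    two_hop = set()
    for neighbor in neighbors:
        two_hop.update(self.graph.neighbors(neighbor))
    two_hop -= neighbors
    two_hop.discard(node)
    return float(len(two_hop))

# it would then be registered alongside the existing entries, e.g.
# base_metrics['two_hop_count'] = two_hop_count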
File file_importer.py added (mode: 100644) (index 0000000..68b7ae9)

import networkx as nx
import graph_tool.all as gt

class FileImporter(object):
    def __init__(self, filename):
        # initialize the data file to parse and new empty graphs
        print('Starting file importer!')
        self.data_file = open(filename)
        self.graph = nx.Graph()
        self.graph_gt = gt.Graph(directed=False)
        self.graph_gt_labels = self.graph_gt.new_vertex_property("double")

    def read(self):
        for line in self.data_file:
            print("Parsing line", line)
            self.parse_line(line)
        return self.graph
        # return {'graph':self.graph, 'graph_gt':self.graph_gt, 'graph_gt_labels':self.graph_gt_labels}
        #self.graph,self.graph_gt,self.graph_gt_labels

    def read_gt(self):
        return {'graph_gt': self.graph_gt, 'graph_gt_labels': self.graph_gt_labels}

    def parse_line(self, line):
        # split each line on the tab character
        # the first field specifies the source node
        # the second field specifies the target node

        fields = line.strip().split("\t")
        from_node = int(fields[0])
        to_node = int(fields[1])

        # print('\n')
        # print('From node is',from_node)
        # print('To node is',to_node)

        # add the edge to the networkx graph, skipping self-loops
        if from_node != to_node:
            self.graph.add_edge(from_node, to_node)
        # print('Network X graph has the following number of nodes',self.graph.number_of_nodes())
        # print('Network X graph has the following number of edges',self.graph.number_of_edges())


        # add the edge to the graph_tool graph and maintain a property map of labels
        # check whether the nodes are already present and create new ones if not
        #temp = gt.Graph(directed=False)
        #temp_name = temp.new_vertex_property("string")
        temp = self.graph_gt
        temp_name = self.graph_gt_labels

        check = None
        if from_node != to_node:  # skip self-loops
            index_from = gt.find_vertex(temp, temp_name, from_node)
            # print('Index from is',index_from)
            index_to = gt.find_vertex(temp, temp_name, to_node)
            # print('Index to is',index_to)
            if index_from == [] and index_to == []:
                # print('No vertices found')
                c1 = temp.add_vertex()
                temp_name[temp.vertex(c1)] = from_node
                # print('Temp_name is now',temp_name[temp.vertex(c1)])
                c2 = temp.add_vertex()
                temp_name[temp.vertex(c2)] = to_node
                # print('Temp_name is now',temp_name[temp.vertex(c2)])
            if index_from != [] and index_to == []:
                # print('Index from is')
                # print(index_from[0])
                c1 = index_from[0]
                #print('C1 is',c1)
                c2 = temp.add_vertex()
                #print('C2 is'),
                #print(c2)
                temp_name[temp.vertex(c2)] = to_node
                # print('Temp_name is now',temp_name[temp.vertex(c2)])
            if index_to != [] and index_from == []:
                # print('Index to is')
                # print(index_to[0])
                c1 = temp.add_vertex()
                c2 = index_to[0]
                temp_name[temp.vertex(c1)] = from_node
                # print('Temp_name is now',temp_name[temp.vertex(c1)])
            if index_from != [] and index_to != []:
                # print('Both vertices found')
                c1 = index_to[0]
                c2 = index_from[0]
                check = temp.edge(c1, c2)  # check if the edge is already present
                # print('Check is',check)
            if check is None:
                # print("Adding edge between",c1,"and",c2)
                temp.add_edge(c1, c2)

            #print(temp_name)
            self.graph_gt = temp
            self.graph_gt_labels = temp_name

        # Check whether GT and NetworkX graphs have the same number of nodes and edges
        # if (self.graph_gt.num_vertices() != self.graph.number_of_nodes()):
        #     print('Unequal number of vertices detected at from node',from_node,'to node',to_node)
        #     print('Number of vertices in Gt Graph is',self.graph_gt.num_vertices())
        #     print('Number of vertices in NetworkX is',self.graph.number_of_nodes())
        # else:
        #     print('Equal number of vertices in both graphs')

        # if (self.graph_gt.num_edges() != self.graph.number_of_edges()):
        #     print('Unequal number of edges detected at from node',from_node,'to node',to_node)
        #     print('Number of edges in Gt Graph is',self.graph_gt.num_edges())
        #     print('Number of edges in NetworkX is',self.graph.number_of_edges())
        # else:
        #     print('Equal number of edges in both graphs')

        # if (self.graph.number_of_nodes() <> self.graph_gt.
        # print('Graph tool graph is',self.graph_gt)
        # print('Graph tool labels map is',self.graph_gt_labels)
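
FileImporter.parse_line expects one edge per line, with the source and target node ids separated by a tab. A minimal usage sketch follows; the file name and the edge list are invented for illustration and are not part of this commit.

# hypothetical usage example, not part of this commit
from file_importer import FileImporter

edges = '1\t2\n1\t3\n2\t3\n'           # three undirected edges between nodes 1, 2 and 3
with open('example_edges.txt', 'w') as f:
    f.write(edges)

importer = FileImporter('example_edges.txt')
nx_graph = importer.read()              # networkx graph: 3 nodes, 3 edges
gt_data = importer.read_gt()            # {'graph_gt': <graph_tool graph>, 'graph_gt_labels': <label property map>}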
File metric_calculator.py added (mode: 100644) (index 0000000..c2cc665)

import networkx as nx
import graph_tool.all as gt
import redis as rd
import numpy as np
import indexing
import statistics
import normalizations
import config
import datetime as dt


class MetricCalculator(object):
    def __init__(self, graph, graph_gt):
        # class constructor
        # define required class variables such as the graph to work on, the redis connection and the nodes of the graph

        print('Starting metric_calculator!')
        self.graph = graph
        self.graph_gt = graph_gt
        # self.graph_gt_labels = graph_gt_labels
        self.redis = rd.StrictRedis(host='localhost', port=6379, db=0)
        self.nodes = nx.nodes(graph)


        # configuration variables are read from the config file and are also saved to class variables for easy access
        self.node_index_key = config.node_index_key
        self.metric_index_key = config.metric_index_key
        self.score_index_key = config.score_index_key

        self.node_neighbors_prefix = config.node_neighbors_prefix
        self.node_prefix = config.node_prefix
        self.metric_prefix = config.metric_prefix
        self.score_prefix = config.score_prefix
        self.statistics_prefix = config.statistics_prefix

        self.normalization_suffix = config.normalization_suffix

        self.base_metrics = config.base_metrics
        self.advanced_metrics = config.advanced_metrics

        self.normalization_methods = config.normalization_methods

        self.scores = config.scores
        self.advanced_scores = config.advanced_scores


    def start(self):
        # clean all data in Redis
        self.redis.flushdb()

        # index creation
        self.create_indexes()

        # main calculations
        self.calculate_metrics()
        self.calculate_advanced_metrics()
        self.normalize_metrics()
        self.calculate_scores()
        self.calculate_advanced_scores()

        # statistics
        self.calculate_statistics()

    ##################
    #### INDEXING ####
    ##################
    def create_indexes(self):
        # call methods defined in indexing.py
        indexing.index_nodes(self)
        indexing.index_neighbors(self)
        indexing.index_metrics(self)
        indexing.index_scores(self)

    ###########################
    #### CALCULATION LOOPS ####
    ###########################

    def calculate_metrics(self):
        # loop through all defined metrics and call the specified calculation method for each node
        print('Starting calculate_metrics')
        for metric_name in self.base_metrics:
            metric_method = self.base_metrics[metric_name]

            # loop through all nodes
            for node in self.nodes:
                # call the calculation method of the supplied metric for the current node
                node = int(node)
                value = float(metric_method(self, node))

                # store the result in the node's hash of values
                self.redis.hset(self.node_prefix+str(node), metric_name, value)

                # also store the result in the metric's sorted set
                self.redis.zadd(self.metric_prefix+metric_name, value, str(node))


    def calculate_advanced_metrics(self):
        # loop through all defined advanced metrics and call the specified calculation method
        print('Starting calculate_advanced_metrics')
        for advanced_metric_name in self.advanced_metrics:
            metric_method = self.advanced_metrics[advanced_metric_name]

            # loop through all nodes
            for node in self.nodes:
                node = int(node)
                value = float(metric_method(self, node))

                # store the result in the node's hash of values
                self.redis.hset(self.node_prefix+str(node), advanced_metric_name, value)

                # also store the result in the metric's sorted set
                self.redis.zadd(self.metric_prefix+advanced_metric_name, value, str(node))


    # loop through all defined normalizations and call the respective normalization method
    # metrics not listed in the "normalization_methods" hash fall back to min-max normalization
    def normalize_metrics(self):
        print('Starting normalize_metrics')
        all_metrics = dict(self.base_metrics.items() + self.advanced_metrics.items())

        for metric_name in all_metrics:
            if metric_name in self.normalization_methods:
                normalization_method = self.normalization_methods[metric_name]
            else:
                # fallback normalization is min-max
                normalization_method = normalizations.min_max
            normalization_method(self, metric_name)


    def calculate_scores(self):
        print('Starting calculate_scores')
        for score_name in self.scores:
            metrics_with_weights = self.scores[score_name]

            for node in self.nodes:
                score_value = 0.0

                # combine the normalized values according to their weights
                for metric in metrics_with_weights:
                    weight = self.scores[score_name][metric]
                    value = float(self.redis.hget(self.node_prefix+str(node), metric+self.normalization_suffix))
                    score_value += weight * value

                self.redis.hset(self.node_prefix+str(node), score_name, score_value)
                self.redis.zadd(self.score_prefix+score_name, score_value, str(node))

    def calculate_advanced_scores(self):
        print('Starting calculate_advanced_scores')
        for advanced_score in self.advanced_scores:
            self.advanced_scores[advanced_score](self)


    #############
    # statistics
    #############

    def calculate_statistics(self):
        print('Starting calculate_statistics')
        for metric in self.base_metrics:
            # absolute and normalized values
            statistics.calculate_statistics(self, metric, self.metric_prefix+metric)
            statistics.calculate_statistics(self, metric+self.normalization_suffix, self.metric_prefix+metric+self.normalization_suffix)

        for advanced_metric in self.advanced_metrics:
            # absolute and normalized values
            statistics.calculate_statistics(self, advanced_metric, self.metric_prefix+advanced_metric)
            statistics.calculate_statistics(self, advanced_metric+self.normalization_suffix, self.metric_prefix+advanced_metric+self.normalization_suffix)

        for score in self.scores:
            statistics.calculate_statistics(self, score, self.score_prefix+score)

        for advanced_score in self.advanced_scores:
            statistics.calculate_statistics(self, advanced_score, self.score_prefix+advanced_score)

        statistics.calculate_correlations(self)
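
The calculation loops above store every value twice: in a per-node hash under node_prefix + node id, and in a per-metric sorted set under metric_prefix + metric name (scores go to score_prefix accordingly). A small read-back sketch, assuming the calculator has already run against the local Redis instance configured above; the node id 42 is an arbitrary example.

# hypothetical read-back example, not part of this commit
import redis as rd
import config

r = rd.StrictRedis(host='localhost', port=6379, db=0)

# every absolute, corrected and normalized value of one node
print(r.hgetall(config.node_prefix + '42'))

# the ten nodes with the highest unified_risk_score, best first
print(r.zrevrange(config.score_prefix + 'unified_risk_score', 0, 9, withscores=True))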
File metrics.py added (mode: 100644) (index 0000000..6673fc8)

#metrics.py
import networkx as nx
import numpy as np
import datetime as dt
import graph_tool.all as gt

def clustering_coefficient(self, node):
    print('Calculating clustering_coefficient for node', node)
    # on the first call, calculate the metric for all nodes at once and cache it in a hash on the instance for later access
    # NOTE: this should result in a performance gain, but for very large graphs it might be a problem.
    #       in that case, just returning nx.clustering(self.graph, node) might be better
    if not hasattr(self, 'all_clustering_coefficients'):
        self.all_clustering_coefficients = nx.clustering(self.graph)

    # get the actual value from the pre-calculated hash
    return self.all_clustering_coefficients[node]

def degree(self, node):
    print('Calculating degree for node', node)
    return self.graph.degree(node)


def average_neighbor_degree(self, node):
    print('Calculating average_neighbor_degree for node', node)
    # same caching technique as in clustering_coefficient
    # might also break for very large graphs
    # nx.average_neighbor_degree(self.graph, nodes=node) might be the way to go

    if not hasattr(self, 'all_average_neighbor_degrees'):
        self.all_average_neighbor_degrees = nx.average_neighbor_degree(self.graph)
    return self.all_average_neighbor_degrees[node]

def iterated_average_neighbor_degree(self, node):
    print('Calculating iterated_average_neighbor_degree for node', node)
    first_level_neighbors = self.graph.neighbors(node)
    # print('First level neighbors are', first_level_neighbors)
    second_level_neighbors = []
    # print('Second level neighbors are', second_level_neighbors)

    # get all two-hop nodes
    for first_level_neighbor in first_level_neighbors:
        current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
        second_level_neighbors.extend(current_second_level_neighbors)

    # remove one-hop nodes and the node itself
    relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

    degree_sum = 0
    for relevant_node in relevant_nodes:
        degree_sum += self.graph.degree(relevant_node)

    if float(len(relevant_nodes)) != 0:
        return float(degree_sum)/float(len(relevant_nodes))
    else:
        return 0

def eccentricity(self, node):
    print('Calculating eccentricity for node', node)
    if not hasattr(self, 'all_eccentricities'):
        l = gt.label_largest_component(self.graph_gt['graph_gt'], directed=None)  # find the largest component
        print('Found the largest component')
        # print("Printing labeled largest component", l.a)
        u = gt.GraphView(self.graph_gt['graph_gt'], vfilt=l)  # extract the largest component as a graph
        print('The number of vertices in the largest component is', u.num_vertices())
        print('The number of vertices in the original graph is', nx.number_of_nodes(self.graph))
        # if nx.is_connected(self.graph) == True:
        if u.num_vertices() == nx.number_of_nodes(self.graph):
            print("Graph is connected")
            self.all_eccentricities = nx.eccentricity(self.graph)
            print("Calculated all eccentricities")
            # print("Eccentricities are", self.all_eccentricities)
            return self.all_eccentricities[node]
        else:
            # return 0
            print("Graph is disconnected")
            self.all_eccentricities = {}

    if self.all_eccentricities != {}:
        print("Returning eccentricity for", node, "-", self.all_eccentricities[node])
        return self.all_eccentricities[node]
    else:
        print("Returning 0")
        return 0

def betweenness_centrality(self, node):
    print('Calculating betweenness_centrality for node', node)
    if not hasattr(self, 'all_betweenness_centralities'):
        self.all_betweenness_centralities = nx.betweenness_centrality(self.graph)
    return self.all_betweenness_centralities[node]


def betweenness_centrality_gt(self, node):
    print('Calculating betweenness_centrality with graph_tool for node', node)
    # print('Self is',self.graph_gt['graph_gt'])
    # print('Self is also',self.graph_gt['graph_gt_labels'])
    # def convert_graph(g):
    #     converts a networkX graph to graph_tool
    #     important: NetworkX node indexes start with 1, whereas graph_tool node indexes start with 0
    #     adj = nx.adjacency_matrix(g)
    #     j = gt.Graph(directed=False)
    #     j.add_vertex(len(adj))
    #     num_vertices = adj.shape[0]
    #     for i in range(num_vertices - 1):
    #         for l in range(i + 1, num_vertices):
    #             if adj[i,l] != 0:
    #                 j.add_edge(i, l)
    #     return j

    if not hasattr(self, 'all_betweenness_centralities_gt'):
        vp, ep = gt.betweenness(self.graph_gt['graph_gt'])
        self.all_betweenness_centralities_gt = vp

    node_label = gt.find_vertex(self.graph_gt['graph_gt'], self.graph_gt['graph_gt_labels'], node)
    # print("Node",node,"has index",node_label)
    # print('Vp is',vp)
    # print('Betweenness centrality of node',node,'is',vp[self.graph_gt['graph_gt'].vertex(node_label[0])])

    return self.all_betweenness_centralities_gt[self.graph_gt['graph_gt'].vertex(node_label[0])]

def average_shortest_path_length(self, node):
    print('Calculating average_shortest_path_length for node', node)
    # caching average_shortest_path_length for all nodes at once failed
    # already switched to single calculation

    # get all shortest path lengths
    all_shortest_path_lengths_for_node = nx.shortest_path_length(self.graph, source=node)

    # calculate the average
    sum_of_lengths = 0
    for target in all_shortest_path_lengths_for_node:
        sum_of_lengths += all_shortest_path_lengths_for_node[target]

    return float(sum_of_lengths)/len(all_shortest_path_lengths_for_node)


#############
# advanced metrics
#############
def correct_clustering_coefficient(self, node):
    print('Calculating correct_clustering_coefficient for node', node)
    clustering_coefficient = float(self.redis.hget(self.node_prefix+str(node), 'clustering_coefficient'))
    degree = float(self.redis.hget(self.node_prefix+str(node), 'degree'))
    corrected_cc = clustering_coefficient + (degree * clustering_coefficient) / float(4)
    return corrected_cc

def correct_average_neighbor_degree(self, node):
    print('Calculating correct_average_neighbor_degree for node', node)
    avgnd = float(self.redis.hget(self.node_prefix+str(node), 'average_neighbor_degree'))

    neighbors = self.graph.neighbors(node)
    number_of_neighbors = float(len(neighbors))
    neighbor_degrees = []
    for neighbor in neighbors:
        neighbor_degrees.append(self.graph.degree(neighbor))

    # using the numpy median and standard deviation implementations
    numpy_neighbor_degrees = np.array(neighbor_degrees)
    median = np.median(numpy_neighbor_degrees)
    standard_deviation = np.std(numpy_neighbor_degrees)

    if avgnd == 0.0 or number_of_neighbors == 0.0 or standard_deviation == 0.0:
        return avgnd
    else:
        return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_neighbors ) * avgnd


def correct_iterated_average_neighbor_degree(self, node):
    print('Calculating correct_iterated_average_neighbor_degree for node', node)
    avgnd = float(self.redis.hget(self.node_prefix+str(node), 'iterated_average_neighbor_degree'))

    first_level_neighbors = self.graph.neighbors(node)
    second_level_neighbors = []

    # get all two-hop nodes
    for first_level_neighbor in first_level_neighbors:
        current_second_level_neighbors = self.graph.neighbors(first_level_neighbor)
        second_level_neighbors.extend(current_second_level_neighbors)

    # remove one-hop neighbors and the node itself
    relevant_nodes = set(second_level_neighbors) - set(first_level_neighbors) - set([node])

    number_of_nodes = len(relevant_nodes)
    node_degrees = []
    for rel_node in relevant_nodes:
        node_degrees.append(self.graph.degree(rel_node))

    numpy_node_degrees = np.array(node_degrees)
    median = np.median(numpy_node_degrees)
    standard_deviation = np.std(numpy_node_degrees)

    if avgnd == 0.0 or number_of_nodes == 0.0 or standard_deviation == 0.0:
        return avgnd
    else:
        return avgnd + ( ((median - avgnd) / standard_deviation) / number_of_nodes ) * avgnd
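
As a quick sanity check of the formula in correct_clustering_coefficient (numbers invented for illustration): a raw clustering coefficient of 0.5 combined with degree 4 yields 0.5 + (4 * 0.5) / 4 = 1.0, while the same raw value with degree 2 only yields 0.75, so the correction favours nodes that keep a high coefficient despite having more neighbors.

# worked example with invented values, not part of this commit
clustering_coefficient = 0.5
degree = 4.0
corrected_cc = clustering_coefficient + (degree * clustering_coefficient) / float(4)
print(corrected_cc)   # 1.0; with degree = 2.0 the result would be 0.75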
File normalizations.py added (mode: 100644) (index 0000000..a959a8c)

#normalizations.py

# min-max normalization
def min_max(self, metric_name):
    # perform min-max normalization of the specified metric for all nodes

    # get min and max from redis
    x_min = self.redis.zrange(self.metric_prefix+metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
    x_max = self.redis.zrange(self.metric_prefix+metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

    #print x_min
    #print x_max

    for node in self.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
            x_normalized = (x - x_min) / (x_max - x_min)

        # store the value for the node and metric
        self.redis.zadd(self.metric_prefix+metric_name+self.normalization_suffix, x_normalized, str(node))
        self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)

# max-min normalization (inverted: high absolute values map to low normalized values)
def max_min(self, metric_name):
    x_min = self.redis.zrange(self.metric_prefix+metric_name, 0, 0, withscores=True, score_cast_func=float)[0][1]
    x_max = self.redis.zrange(self.metric_prefix+metric_name, -1, -1, withscores=True, score_cast_func=float)[0][1]

    for node in self.nodes:
        if x_min == x_max:
            x_normalized = 1.0
        else:
            x = float(self.redis.hget(self.node_prefix+str(node), metric_name))
            x_normalized = (x_max - x) / (x_max - x_min)

        # store the value for the node and metric
        self.redis.zadd(self.metric_prefix+metric_name+self.normalization_suffix, x_normalized, str(node))
        self.redis.hset(self.node_prefix+str(node), metric_name+self.normalization_suffix, x_normalized)
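
A small worked example of the two normalizations above, with invented metric values 2, 4 and 10: min_max maps them to 0.0, 0.25 and 1.0, while max_min inverts the ordering and yields 1.0, 0.75 and 0.0, which is why max_min is configured for metrics where smaller is better, such as average_shortest_path_length.

# worked example with invented values, not part of this commit
values = [2.0, 4.0, 10.0]
x_min, x_max = min(values), max(values)
print([(x - x_min) / (x_max - x_min) for x in values])   # min_max: [0.0, 0.25, 1.0]
print([(x_max - x) / (x_max - x_min) for x in values])   # max_min: [1.0, 0.75, 0.0]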
File pearson.py added (mode: 100644) (index 0000000..7a6cc1c)

import redis as rd
import numpy as np
from scipy.stats import pearsonr

metrics = ['clustering_coefficient',
           'degree',
           'average_neighbor_degree',
           'iterated_average_neighbor_degree',
           'betweenness_centrality',
           'eccentricity',
           'average_shortest_path_length',
           'corrected_clustering_coefficient',
           'corrected_average_neighbor_degree',
           'corrected_iterated_average_neighbor_degree']

rdb = rd.StrictRedis(host='localhost', port=6379, db=0)


correlations = {}
for metric1 in metrics:
    correlations[metric1] = {}
    for metric2 in metrics:
        correlations[metric1][metric2] = (0, 0)
        if metric1 == metric2:
            correlations[metric1][metric2] = (1, 0)
            continue

        dict_metric1 = dict(rdb.zrange(metric1, 0, -1, withscores=True, score_cast_func=float))
        dict_metric2 = dict(rdb.zrange(metric2, 0, -1, withscores=True, score_cast_func=float))

        values_metric1 = []
        values_metric2 = []

        for key in sorted(dict_metric1.iterkeys()):
            values_metric1.append(dict_metric1[key])

        for key in sorted(dict_metric2.iterkeys()):
            values_metric2.append(dict_metric2[key])

        correlations[metric1][metric2] = pearsonr(values_metric1, values_metric2)

for source in correlations:
    for target in correlations[source]:
        rdb.hset("correlations:"+source+":"+target, "correlation", correlations[source][target][0])
        rdb.hset("correlations:"+source+":"+target, "confidence", correlations[source][target][1])
File start.py added (mode: 100644) (index 0000000..26df05e)

#!/usr/bin/env python
import datetime
import argparse
import cProfile, pstats, StringIO
from file_importer import FileImporter
from metric_calculator import MetricCalculator
import datetime as dt

print 'Starting metric calculation', dt.datetime.now()
parser = argparse.ArgumentParser(description='Read a tab-separated graph data file and start the calculation of metrics and statistics as configured in config.py')

parser.add_argument('filename', metavar='filename', type=str,
                    help='the name of the data file containing tab-separated node ids')

parser.add_argument('--profiling', dest='profiling', action='store_true', help='enable runtime profiling into a timestamped profiling output file')

args = parser.parse_args()

if args.profiling:
    pr = cProfile.Profile()
    s = StringIO.StringIO()
    timestamp = str(datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
    outfile = open('profiling_output_'+timestamp+'.txt', 'w')
    pr.enable()

fi = FileImporter(args.filename)
graph = fi.read()
#print('This should be a Network X graph',graph)
print('Network X graph has the following number of nodes', graph.number_of_nodes())
print('Network X graph has the following number of edges', graph.number_of_edges())
graph_gt = fi.read_gt()
print('Graph tool graph has the following number of nodes', graph_gt['graph_gt'].num_vertices())
print('Graph tool graph has the following number of edges', graph_gt['graph_gt'].num_edges())
#print('Gt graph has the following properties')
mc = MetricCalculator(graph, graph_gt)
mc.start()

if args.profiling:
    ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
    ps.print_stats()
    outfile.write(s.getvalue())

print 'Ending metric calculation', dt.datetime.now()
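
Assuming the edge list lives in a tab-separated file such as example_edges.txt (a hypothetical name), the whole pipeline would be started with

    python start.py example_edges.txt

and adding the --profiling flag additionally writes a profiling_output_<timestamp>.txt report to the current directory.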
File statistics.py added (mode: 100644) (index 0000000..fb03eaa)

#statistics.py
import redis as rd
import numpy as np
from scipy.stats import pearsonr

def calculate_statistics(self, metric, redis_key):
    all_values = dict(self.redis.zrange(redis_key, 0, -1, withscores=True, score_cast_func=float)).values()
    min_value = np.min(all_values)
    max_value = np.max(all_values)

    average = np.average(all_values)
    median = np.median(all_values)
    standard_deviation = np.std(all_values)

    self.redis.hset(self.statistics_prefix+metric, 'min', min_value)
    self.redis.hset(self.statistics_prefix+metric, 'max', max_value)
    self.redis.hset(self.statistics_prefix+metric, 'average', average)
    self.redis.hset(self.statistics_prefix+metric, 'median', median)
    self.redis.hset(self.statistics_prefix+metric, 'standard_deviation', standard_deviation)


def calculate_correlations(self):
    m = self.base_metrics.keys()
    c = self.advanced_metrics.keys()

    metrics = m + c

    correlations = {}
    for metric1 in metrics:
        correlations[metric1] = {}
        for metric2 in metrics:
            correlations[metric1][metric2] = (0, 0)
            if metric1 == metric2:
                correlations[metric1][metric2] = (1, 0)
                continue

            dict_metric1 = dict(self.redis.zrange(self.metric_prefix+metric1, 0, -1, withscores=True, score_cast_func=float))
            dict_metric2 = dict(self.redis.zrange(self.metric_prefix+metric2, 0, -1, withscores=True, score_cast_func=float))
            values_metric1 = []
            values_metric2 = []

            for key in sorted(dict_metric1.iterkeys()):
                values_metric1.append(dict_metric1[key])

            for key in sorted(dict_metric2.iterkeys()):
                values_metric2.append(dict_metric2[key])

            correlations[metric1][metric2] = pearsonr(values_metric1, values_metric2)

            values_metric1 = []
            values_metric2 = []

    for source in correlations:
        for target in correlations[source]:
            self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "correlation", correlations[source][target][0])
            self.redis.hset(self.statistics_prefix+"correlations:"+source+":"+target, "confidence", correlations[source][target][1])
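
The summary statistics end up in a hash under statistics_prefix + metric with the fields min, max, average, median and standard_deviation; the pairwise results of pearsonr are stored under statistics_prefix + 'correlations:' + metric1 + ':' + metric2, where the field correlation holds the coefficient and confidence holds the accompanying p-value. A short read-back sketch with hypothetical keys, not part of this commit:

# hypothetical read-back example, not part of this commit
import redis as rd
import config

r = rd.StrictRedis(host='localhost', port=6379, db=0)
print(r.hgetall(config.statistics_prefix + 'degree'))
print(r.hgetall(config.statistics_prefix + 'correlations:degree:clustering_coefficient'))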