# Initial idea for an efficient algorithm

import random
import time
import numpy as np

# 1. Read In Problem instance
# 2. Preprocess the graph weights
# 3. Do max-TSP

# Parses the input of the challenge
#	@param:
#		file_path ... The path to the file, specifying the tags of the pictures
#	@return:
#		pics      ... The list of pictures with their orientation and tags
def parse_input(file_path):
	with open(file_path, 'r') as file:
		n = int(file.readline().strip().split()[0])
		print('Problem Size =',n) # Debug Information
		
		line = file.readline().strip().split()
		sum_tags = 0
		pics_h = []
		pics_v = []
		for i in range(n):
			tag_len = int(line[1])			
			sum_tags += tag_len
			tags = tag_len*[0]
			for j in range(int(line[1])):
				tags[j] = line[2+j]
			
			if line[0] == 'V':
				pics_v.append((i,tags))
			else:
				pics_h.append((i,tags))
			line = file.readline().strip().split()
		avg = int(sum_tags/n)
		print('Average tag length: ',avg) # Debug Information

		
	return prune(pics_v,avg), prune(pics_h,avg)


# Prunes the pictures, by removing those with a very below average number of tags
# A picture with a low number of tags is very prone to being fully included in other pictures
#	@param:
#		pics   ... The set of pictures
#		avg    ... The average number of tags in the set
#	@return:
#		pruned ... The reduced set of pictures
def prune(pics, avg):
	# Annihilation of bad pictures
	pruned = []
	prune_factor = int(avg/3) # Optimum to be tested
	for i in range(len(pics)):
		if len(pics[i][1]) > prune_factor: pruned.append(pics[i])

	print('Pruned',len(pics),'pictures to',len(pruned)) # Debug Information

	return pruned


# Calculates the score of the given pair of slides
# This metric uses the built in set operations instead of binary search
# FIRST STEP TOWARDS EFFICIENCY
#	@param:
#		slide1 ... The tags of the first slide
#		slide2 ... The tags of the next slide
#	@return:
#		metric ... The score as specified by the challenge
def metric_set(slide1,slide2): # only the tags
	# 1. Number of similar
	# 2. Number of diff in S1
	# 3. Number of diff in S2
	l1 = set(slide1)
	l2 = set(slide2)
	d1 = len(l1-l2)
	sim = len(l1)-d1
	d2 = len(l2)-sim
	metric = min(d1,d2,sim)
	return metric


# Calculates the score of the given pair of slides with a linear search
# If the first Slide has fewer tags, they get swapped
# ORIGINAL
#	@param:
#		slide1 ... The tags of the first slide
#		slide2 ... The tags of the next slide
#	@return:
#		metric ... The score as specified by the challenge
def metric_self(slide1,slide2): # only the tags
	# 1. Number of similar
	# 2. Number of diff in S1
	# 3. Number of diff in S2
	sim = 0

	if len(slide2) > len(slide1): # swap if fewer
		t = slide1
		slide1 = slide2
		slide2 = t

	for i in range(len(slide1)):
		if slide1[i] in slide2: sim += 1

	dif1 = len(slide1) - sim
	dif2 = len(slide2) - sim
	metric = min(dif1,dif2,sim)
	return metric


# Creates the adjacency matrix for the TSP algorithm
# This was done to minimize the effort while searching for an optimal solution
# This way we have for any TSP algorithm O(2n²) or O(n³+n²), instead of having to recalculate for every iteration
#	@param:
#		instance ... The problem instance
#	@result:
#		graph    ... The fully connected graph structure
def preprocessing(instance):
	n = len(instance)
	graph = n*[[]]
	q = int(n/4)
	h = int(n/2)
	t = int(3*n/4)
	for i in range(n):
		line = n*[0]
		for j in range(i+1,n):
			line[j] = metric_set(instance[i][1],instance[j][1])
		graph[i] = line
		if i == q: print('25% finished')
		elif i == h: print('50% finished')
		elif i == t: print('75% finished')

	for i in range(n):
		for j in range(i):
			graph[i][j] = graph[j][i]
	return graph


# Naive combination algorithm for better slides
# Only combines adjacent horizontal pictures without any heuristics
#	@param:
#		pics_v ... Vertical pictures
#	@return:
#		com    ... Merged pictures, names as list of 2 elements, set of tags as unique
def combine_vertical(pics_v):
	if len(pics_v) % 2 == 1:
		pics_v = pics_v[0:len(pics_v)-2]
	n = int(len(pics_v)/2)
	com = [0]*n
	for i in range(n):
		 com[i] = ([pics_v[2*i][0],pics_v[2*i+1][0]],list(set(pics_v[2*i][1] + pics_v[2*i+1][1])))

	return com


# ORIGINAL
# Greedy Exhaustive TSP, O(n³)
# Uses each element as first for the search of the longest path
#	@param:
#		graph ... The adjacency matrix of the pictures
#	@return:
#		path  ... The Hammiltonian trail in the graph
def max_tsp(graph): # greedy exhaustive TSP
	n = len(graph)
	best_path_l = -1
	best_path = []

	q = int(n/4)
	h = int(n/2)
	t = int(3*n/4)

	for c in range(n):
		path = [-1]*n
		path[0] = c
		cur = c
		length = 0
		for i in range(1,n):
			best = 0
			for j in range(n):
				if graph[cur][j] >= best and j not in path:
					best = graph[cur][j]
					path[i] = j
			cur = path[i]
			length += best
		if length >= best_path_l:
			best_path_l = length
			best_path = path
		
		if c == q: print('25% finished')
		elif c == h: print('50% finished')
		elif c == t: print('75% finished')

	return best_path


# SECOND BETTER METHOD
# Monte Carlo flavor of TSP, with randomized starting point
# n times faster than original method, but no guarantee for best value
# Train of thought: The first element has only one time direct influence on the overall points, therefore by assumption of a large problem space, we do not loose a lot of points by randomly selecting while gaining a lot of speed
#	@param:
#		graph ... The adjacency matrix of the pictures
#	@return:
#		path  ... The path of ordered pictures
def monte_carlo_tsp(graph): # greedy randomized TSP
	n = len(graph)
	path = [-1]*n
	start = random.randint(0,n-1)
	q = int(n/4)
	h = int(n/2)
	t = int(3*n/4)
	path[0] = start
	cur = start
	for i in range(1,n):
		best = 0
		for j in range(n):
			if graph[cur][j] >= best and j not in path:
				best = graph[cur][j]
				path[i] = j
		cur = path[i]
	
		if i == q: print('25% finished')
		elif i == h: print('50% finished')
		elif i == t: print('75% finished')

	return path


# THIRD VERSION OF TSP SOLVER
# Idea: Create an almost linear time priority queue for each element, so that the n/2 best matches are accessible in O(1)
# 	Then check if the most important element is not yet picked and add it to the path. This leads to a better best and average case complexity
#	@param:
#		graph ... The adjacency matrix of the pictures
#	@return:
#		path  ... The Hammiltonian trail in the graph
def priority_tsp(graph):
	n = len(graph)
	bests = int(n/3)
	best_in_slot = [-1]*n

	for i in range(n):
		l = [-1]*bests
		for j in range(bests):
			b = np.argmax(graph[i])
			l[j] = b
			graph[i][b] = -1
		best_in_slot[i] = l
	
	cur = random.randint(0,n-1)	
	path = [cur]
	last = -1
	for i in range(1,n):
		if cur == last: break
		last = cur
		for j in range(bests):
			best = best_in_slot[cur][j]
			if best not in path:
				cur = best
				path.append(best)
				break
	print('Omitted',n-len(path),'pictures')
	return path


# Fourth Version: Based on the findings of Prio_TSP, we calculate on demand now, saving a lot of time for the queues
# Uses a random element as first for the search of the longest path
#	@param:
#		graph ... The adjacency matrix of the pictures
#	@return:
#		path  ... The Hammiltonian trail in the graph
def greedy_tsp(graph):
	n = len(graph)
	cur = random.randint(0,n-1)	
	path = [cur]*n

	q = int(n/4)
	h = int(n/2)
	t = int(3*n/4)

	for i in range(1,n):
		for j in range(n):
			best = np.argmax(graph[cur])
			if best not in path:
				cur = best
				path[i] = best
				break
			else:
				graph[cur][best] = -1

		if i == q: print('25% finished')
		elif i == h: print('50% finished')
		elif i == t: print('75% finished')

	return path


# Seventh Optimization, does not use the graph anymore, extremely fast
# Uses a random element as first for the search of the longest path
#	@param:
#		instance ... The list of pictures
#	@return:
#		path     ... The Hammiltonian trail in the graph
def demand_tsp(instance):
	n = len(instance)
	cur = random.randint(0,n-1)	
	path = [cur]*n
	taken = [0]*n
	taken[cur] = 1

	q = int(n/4)
	h = int(n/2)
	t = int(3*n/4)

	for i in range(1,n):
		best = 0
		for j in range(n):
			if taken[j] == 0:
				metric = metric_set(instance[cur][1], instance[j][1])
				if best <= metric:
					best = metric
					path[i] = j

		if i == q: print('25% finished')
		elif i == h: print('50% finished')
		elif i == t: print('75% finished')
		cur = path[i]
		taken[cur] = 1

	return path


# Uses the correct names of the pictures in the order of the instance list
#	@param:
#		solution ... The path of the solution
#		instance ... The instance of the test
#	@return:
#		slides   ... The named slides
def name_path(solution, instance):
	slides = [0]*len(solution)
	for i in range(len(solution)):
		slides[i] = instance[solution[i]]
	return slides


# Automatic Evaluation Procedure
#	@param:
#		solution ... The path determined by the program
#		instance ... The instance of the test
#	@return:
#		score    ... Integer value of scored points
def evaluation(solution,instance):
	score = 0
	for i in range(len(solution)-1):
		score += metric_set(instance[solution[i]][1],instance[solution[i+1]][1])
	return score


# Writes the given path to the specified name
# Layout determined by Google
#	@param:
#		solution  ... The picture ordering
#		file_name ... The name of the file to be written
#	@return:
def write_solution(solution, file_name):
	with open(file_name,'w') as file:
		file.write('{}\n'.format(len(solution)))
		for i in range(len(solution)):
			if type(solution[i][0]) == list:
				file.write('{} {}\n'.format(solution[i][0][0],solution[i][0][1]))
			else:
				file.write('{}\n'.format(solution[i][0]))
	return


# Solver, used to contain a single run of the program
#	@param:
#		case     ... The input file
#		solution ... The output file
#	@return:
def solve_graph(case, solution):
	print('Starting with Procedure for', case.split('/')[1])
	
	start = time.time()
	pics_v,pics_h = parse_input(case)

	pics = pics_h + combine_vertical(pics_v)

	print('Preparing Graph Structure')

	s = time.time()
	g = preprocessing(pics)

	print('DONE in', time.time()-s, 'sec')
	print('Solving TSP')

	s = time.time()
	result = greedy_tsp(g)

	print('DONE in', time.time()-s, 'sec')

	name = name_path(result,pics)
	elaps = time.time() - start

	write_solution(name, solution)

	print()
	print('Best Score:', evaluation(result,pics))
	print('Time taken:', elaps, 'sec')
	print()
	return


# Solver without the graph structure
#	@param:
#		case     ... The input file
#		solution ... The output file
#	@return:
def solve(case, solution):
	print('Starting with Procedure for', case.split('/')[1])
	
	start = time.time()
	pics_v,pics_h = parse_input(case)

	pics = pics_h + combine_vertical(pics_v)

	print('Solving TSP')

	s = time.time()
	result = demand_tsp(pics)

	print('DONE in', time.time()-s, 'sec')

	name = name_path(result,pics)
	elaps = time.time() - start

	write_solution(name, solution)

	print()
	print('Best Score:', evaluation(result,pics))
	print('Time taken:', elaps, 'sec')
	print()
	return

# The main procedure, selects the test instance
# Can be switched to Auto Mode, to run through all test cases
#	@param:
#	@return:
def main():
	cases = ['input_qualification_round_2019/a_example.txt',
		 'input_qualification_round_2019/b_lovely_landscapes.txt',
		 'input_qualification_round_2019/c_memorable_moments.txt',
		 'input_qualification_round_2019/d_pet_pictures.txt',
		 'input_qualification_round_2019/e_shiny_selfies.txt',
		 'input_qualification_round_2019/f_synth1000.txt',
		 'input_qualification_round_2019/g_synth2000.txt',
		 'input_qualification_round_2019/h_synth3000.txt',
		 'input_qualification_round_2019/i_synth4000.txt',
		 'input_qualification_round_2019/j_synth5000.txt',
		 'input_qualification_round_2019/k_synth20000.txt',
		 'input_qualification_round_2019/l_synth40000.txt']
	solutions = ['a_solution.txt',
		'b_solution.txt',
		'c_solution.txt',
		'd_solution.txt',
		'e_solution.txt',
		'f_solution.txt',
		'g_solution.txt',
		'h_solution.txt',
		'i_solution.txt',
		'j_solution.txt',
		'k_solution.txt',
		'l_solution.txt']

	CASE = 4 ## IMPORTANT: chooses the test instance ##
	AUTO = False

	if AUTO:
		for CASE in range(len(cases)):
			solve(cases[CASE],solutions[CASE])
	else:
		solve(cases[CASE],solutions[CASE])


main()

'''
# Tests to find out why CASE = 1 gets killed
l1 = list(range(16))
l2 = list(range(13,38))

n = 90000
n = int((n*n-n)/2)
print(n)
s = time.time()
r = []
for i in range(n):
	r.append(0)
	#metric_set(l1,l2)
elaps = time.time() - s
print(elaps,'sec')
'''