
/*
 * Function-like minimum / maximum helpers.
 * Both are plain textual macros: the winning argument is expanded (and thus
 * evaluated) twice, so do not pass expressions with side effects such as i++.
 */
#define min(X,Y) (((X) < (Y)) ? (X) : (Y))
#define max(X,Y) (((X) > (Y)) ? (X) : (Y))
#include <math.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <mpi.h>
#include "project.h"

/* Default value; main() hands its address to process_args(), which may overwrite it. */
double eps = 0.001;
/* Default time step; may be overwritten by process_args() and is capped in main(). */
double delta_t = 0.000001;
/* Coefficient used in main()'s upper bound for delta_t (presumably the diffusivity -- confirm). */
double alpha = 1;
/* MPI datatype handle describing t_process_info; set up by Create_MPI_Type_t_process_info(). */
MPI_Datatype MPI_process_info;

void Create_MPI_Type_t_process_info() {
	t_process_info mock;
	MPI_Datatype type[7] = {MPI_INT, MPI_INT, MPI_INT, MPI_INT, MPI_INT, MPI_INT, MPI_INT};
	int blocklen[7] = {1, 1, 1, 1, 1, 1, 1};
	MPI_Aint disp[7];
	disp[0] = (int)&(mock.rank) - (int)&mock;
	disp[1] = (int)&(mock.coord0) - (int)&mock;
	disp[2] = (int)&(mock.coord1) - (int)&mock;
	disp[3] = (int)&(mock.start_m) - (int)&mock;
	disp[4] = (int)&(mock.start_n) - (int)&mock;
	disp[5] = (int)&(mock.end_m) - (int)&mock;
	disp[6] = (int)&(mock.end_n) - (int)&mock;
	MPI_Type_create_struct(7, blocklen, disp, type, &MPI_process_info);
	MPI_Type_commit(&MPI_process_info);
}

/*
 * Program entry point.
 *
 * Steps performed:
 *   1. Initialise MPI, register the t_process_info datatype and read the
 *      field size (m x n), eps and delta_t via process_args().
 *   2. Arrange the processes in a non-periodic 2D Cartesian grid; processes
 *      that do not fit into the grid finalise MPI and exit.
 *   3. Each rank derives its own index range [start_m..end_m] x
 *      [start_n..end_n] and allocates a local matrix with one extra
 *      row/column on every side.
 *   4. Rank 0 gathers and prints every rank's range.
 *   5. Each rank fills the interior of its local matrix (currently with the
 *      placeholder value 8) and sends it to rank 0, which inserts all pieces
 *      into the full field and writes it out via Write_Matrix().
 *
 * Returns 0 on success; calls exit(1) on any setup, allocation or MPI error.
 */
int main(int argc, char **argv)
{
	MPI_Init(&argc, &argv);
	int m, n;
	double **root_field = NULL;		/* full field, only allocated on rank 0 */
	double **partial_field;
	t_process_info pi;
	t_process_info *infos = NULL;	/* gather target, only allocated on rank 0 */

	Create_MPI_Type_t_process_info();

	process_args(argc, argv, &m, &n, &eps, &delta_t);
	if(m < 1 || n < 1) {
		/* Every later step divides by m or n. */
		fprintf(stderr, "Invalid field size %dx%d\n", m, n);
		exit(1);
	}

	int pid, num_p;
	if(MPI_Comm_rank(MPI_COMM_WORLD, &pid)) {
		fprintf(stderr, "Cannot fetch PID\n");
		exit(1);
	}

	if(MPI_Comm_size(MPI_COMM_WORLD, &num_p)) {
		fprintf(stderr, "Cannot fetch size of cluster\n");
		exit(1);
	}

	if(pid == 0) {
		root_field = New_Matrix(m, n);
		if (root_field == NULL) {
			fprintf(stderr, "PID %d: Can't allocate root_field !\n", pid);
			exit(1);
		}

		Init_Matrix(root_field, m, n, 0);
		printf("number of processes: %d\n", num_p);
	}

	/*
	 * Choose the process grid so that its aspect ratio follows m:n.
	 * temp_g ~ sqrt(num_p * m / n) is the process count along m and
	 * temp_f = num_p / temp_g the count along n.  Dimension 0 of the grid
	 * is the m direction because coord[0] selects the m slice below.
	 */
	int pro_per_dim[2];
	float temp_f,temp_g;
	temp_g = sqrt((float)num_p * (float)m / (float)n);
	temp_f = num_p/temp_g;
	pro_per_dim[0] = (int)floor(temp_g);
	pro_per_dim[1] = (int)floor(temp_f);

	/*
	 * For very elongated fields one count floors to 0.  Use a single
	 * process in that direction and cap the other one so that the grid
	 * never needs more than num_p processes.
	 */
	if(pro_per_dim[0] < 1) {
		pro_per_dim[0] = 1;
		if(pro_per_dim[1] > num_p) {
			pro_per_dim[1] = num_p;
		}
	}
	if(pro_per_dim[1] < 1) {
		pro_per_dim[1] = 1;
		if(pro_per_dim[0] > num_p) {
			pro_per_dim[0] = num_p;
		}
	}

	int m_per_pro, n_per_pro;
	m_per_pro = ceil(m/(float)pro_per_dim[0]);
	n_per_pro = ceil(n/(float)pro_per_dim[1]);

	if(pid==0) {
		printf("dim0: %d dim1: %d\n", pro_per_dim[0], pro_per_dim[1]);
		printf("size per pro: %dx%d\n", m_per_pro, n_per_pro);
	}

	int periods[] = {0,0};	// edges are not connected
	MPI_Comm cart_comm;
	/* reorder = 0: ranks in cart_comm equal the ranks in MPI_COMM_WORLD. */
	if(MPI_Cart_create(MPI_COMM_WORLD, 2, pro_per_dim, periods, 0, &cart_comm)) {
		fprintf(stderr, "Cannot create topology\n");
		exit(1);
	}
	if(cart_comm == MPI_COMM_NULL) {
		printf("process %d not in use. exiting...\n", pid);
		MPI_Finalize();
		exit(0);
	}
	int coord[2];
	if(MPI_Cart_coords(cart_comm, pid, 2, coord)) {
		fprintf(stderr, "Cannot get coordinates\n");
		exit(1);
	}

	// calculate own field using coord
	pi.start_m = coord[0] * m_per_pro;
	pi.end_m = (coord[0]+1)*m_per_pro -1;
	pi.start_n = coord[1] * n_per_pro;
	pi.end_n = (coord[1]+1) * n_per_pro -1;
	/* The last slice in each direction may be shorter than the others. */
	if(pi.end_m > m - 1) {
		pi.end_m = m - 1;
	}
	if(pi.end_n > n - 1) {
		pi.end_n = n - 1;
	}
	pi.coord0 = coord[0];
	pi.coord1 = coord[1];
	pi.rank = pid;

	/* Own range plus one extra row/column on each side. */
	int matrix_size[2];
	matrix_size[0] = pi.end_m - pi.start_m + 3;
	matrix_size[1] = pi.end_n - pi.start_n + 3;

	/* From here on num_p is the number of processes inside the grid. */
	if(MPI_Comm_size(cart_comm, &num_p)) {
		fprintf(stderr, "Cannot fetch size of cart\n");
		exit(1);
	}

	if(pid == 0) {
		infos = malloc(sizeof *infos * num_p);
		if(infos == NULL) {
			fprintf(stderr, "PID %d: Can't allocate infos !\n", pid);
			exit(1);
		}
	}

	if(MPI_Gather(&pi, 1, MPI_process_info, infos, 1, MPI_process_info, 0, cart_comm)) {
		fprintf(stderr, "Gather failed\n");
		exit(1);
	}

	int i,j;

	if(pid == 0) {
		for(i = 0; i < num_p; i++) {
			printf("pid: %d->(%d,%d) from (%d, %d) to (%d,%d)\n",
				infos[i].rank,
				infos[i].coord0,
				infos[i].coord1,
				infos[i].start_m,
				infos[i].start_n,
				infos[i].end_m,
				infos[i].end_n
			);
		}
	}

	partial_field = New_Matrix(matrix_size[0], matrix_size[1]);
	if (partial_field == NULL) {
		fprintf(stderr, "PID %d: Can't allocate partial_field %d, %d end_M: %d, start_m: %d, end_n: %d, start_n: %d!\n",
			pid,
			matrix_size[0],
			matrix_size[1],
			pi.end_m,
			pi.start_m,
			pi.end_n,
			pi.start_n
		);
		exit(1);
	}

	double **partial_field_tmp = New_Matrix(matrix_size[0], matrix_size[1]);
	if (partial_field_tmp == NULL) {
		fprintf(stderr, "PID %d: Can't allocate partial_field_tmp !\n", pid);
		exit(1);
	}
	double **swap;
	double hx = 1.0/(double)m;
	double hy = 1.0/(double)n;

	/* Upper bound on the time step for convergence. */
	double max_delta_t = 0.25*((min(hx,hy))*(min(hx,hy)))/alpha;
	if (delta_t > max_delta_t) {
		delta_t = max_delta_t;
		if(pid == 0)
			printf ("Info: delta_t set to %.10lf.\n", delta_t);
	}

	/*
	 * Update the interior cells (indices 1 .. size-2); the outermost
	 * row/column on each side is skipped.
	 *
	 * TODO: replace the placeholder value with the real update step:
	 *   delta_a = alpha * delta_t *
	 *       ( (f[i+1][j] + f[i-1][j] - 2.0 * f[i][j]) / (hy * hy)
	 *       + (f[i][j-1] + f[i][j+1] - 2.0 * f[i][j]) / (hx * hx) );
	 *   tmp[i][j] = f[i][j] + delta_a;
	 * and track the largest delta_a.
	 */
	for(i = 1; i < matrix_size[0] - 1; i++) {
		for(j = 1; j < matrix_size[1] - 1; j++) {
			partial_field_tmp[i][j] = 8;
		}
	}
	swap = partial_field_tmp;
	partial_field_tmp = partial_field;
	partial_field = swap;

	/*
	 * Ship the local matrix to rank 0.  The send is non-blocking because
	 * rank 0 also sends to itself and only posts its receives afterwards;
	 * a blocking MPI_Send could wait forever there.
	 * NOTE(review): sending from partial_field[0] assumes New_Matrix()
	 * stores all rows in one contiguous buffer -- confirm.
	 */
	const int send_count = matrix_size[0] * matrix_size[1];
	MPI_Request send_request;
	if(MPI_Isend(partial_field[0], send_count, MPI_DOUBLE, 0, 0, cart_comm, &send_request)) {
		fprintf(stderr, "Send to root failed\n");
		exit(1);
	}

	if(pid == 0) {
		MPI_Request *requests = malloc(sizeof *requests * num_p);
		double **allocation = malloc(sizeof *allocation * num_p);
		if(requests == NULL || allocation == NULL) {
			fprintf(stderr, "PID %d: Can't allocate receive bookkeeping !\n", pid);
			exit(1);
		}

		/* Post one receive per rank, sized from the gathered ranges. */
		for(i = 0; i < num_p; i++) {
			const int rows = infos[i].end_m - infos[i].start_m + 3;
			const int cols = infos[i].end_n - infos[i].start_n + 3;

			allocation[i] = malloc(sizeof *allocation[i] * rows * cols);
			if(allocation[i] == NULL) {
				fprintf(stderr, "PID %d: Can't allocate receive buffer for rank %d !\n", pid, infos[i].rank);
				exit(1);
			}
			MPI_Irecv(allocation[i],
				rows * cols,
				MPI_DOUBLE,
				infos[i].rank,
				MPI_ANY_TAG,
				cart_comm,
				&requests[i]
			);
		}

		/* Insert the pieces in whatever order they arrive. */
		for(i = 0; i < num_p; i++) {
			int current;
			MPI_Waitany(num_p, requests, &current, MPI_STATUS_IGNORE);
			Insert_Array_In_Matrix(
				root_field,
				m,
				n,
				infos[current].start_m,
				infos[current].start_n,
				allocation[current],
				infos[current].end_m - infos[current].start_m + 3,
				infos[current].end_n - infos[current].start_n + 3,
				1, 1, 1, 1);
			free(allocation[current]);
		}
		free(requests);
		free(allocation);
		Write_Matrix(root_field, m, n);
	}

	/* The send buffer must stay untouched until the send has completed. */
	MPI_Wait(&send_request, MPI_STATUS_IGNORE);

	// TODO: write neighbour communication function

	/*
	 * NOTE(review): root_field, partial_field and partial_field_tmp are not
	 * released here because no matching deallocation routine for
	 * New_Matrix() is visible in this file.
	 */
	free(infos);
	MPI_Comm_free(&cart_comm);
	MPI_Type_free(&MPI_process_info);
	MPI_Finalize();

	return 0;
}