
/*
 * Function-like minimum / maximum helpers.
 * Each argument appears twice in the expansion, so do not pass expressions
 * with side effects (e.g. min(i++, j)).
 */
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h> 
#include <sys/time.h>
#include "project.h"
#include "omp.h"

int main(int argc, char **argv)
{
	double eps = 0.001;
	double delta_t = 0.000001;
	double alpha = 1;
	int dim0_size, dim1_size;
	int num_threads;
	double **root_field;				// complete field owned by root
	double **partial_field;				// partial field where process works on
	double **partial_field_clipboard;	// copy of partial field
	t_process_info pi;
	t_process_info *infos;
	int pro_per_dim[2];
	int cell_per_pro[2];
	MPI_Comm cart_comm;
	int matrix_size[2];
	int neighbor_dim0_left, neighbor_dim0_right, neighbor_dim1_left, neighbor_dim1_right;
	double hx, hy, hx_square, hy_square;
	double *dim1_own_edge_values;
	double *dim1_neighbor_egde_values;
	MPI_Request sync_requests[9];	// 2 for each edge + 1 completion
	int iterations = 0;
	double time_start, time_iterate, time_end;

	MPI_Init(&argc, &argv);

	Process_Args(argc, argv, &num_threads, &dim0_size, &dim1_size, &eps, &delta_t);

	printf("%d threads per process\n", num_threads);
	omp_set_num_threads(num_threads);

	int rank, cart_cluster_size;
	if(MPI_Comm_rank(MPI_COMM_WORLD, &rank)) {
		fprintf(stderr, "Cannot fetch rank\n");
		exit(1);
	}

	if(rank == 0) {
		time_start = MPI_Wtime();
	}

	if(rank == 0) {
		root_field = New_Matrix(dim0_size, dim1_size);
		if (root_field == NULL) {
			fprintf(stderr, "rank %s: Can't allocate root_field !\n", rank);
			exit(1);
		}
	}

	// optimize cart cluster
	Optimize_Cart_Cluster(dim0_size, dim1_size, MPI_COMM_WORLD, rank, pro_per_dim, cell_per_pro);

	cart_comm = Create_MPI_Cart_Cluster(MPI_COMM_WORLD, rank, pro_per_dim);

	pi = Calculate_Process_Info(cart_comm, rank, dim0_size, dim1_size, cell_per_pro);

	matrix_size[0] = pi.end_m - pi.start_m + 3;
	matrix_size[1] = pi.end_n - pi.start_n + 3;

	if(MPI_Comm_size(cart_comm, &cart_cluster_size)) {
		fprintf(stderr, "Cannot fetch size of cart\n");		
		exit(1);
	}

	infos = Gather_Process_Info(&pi, rank, cart_cluster_size, cart_comm);

	if(rank == 0) {
		for(int i = 0; i < cart_cluster_size; i++) {
			Print_Process_Info(infos[i]);
		}
	}

	Alloc_Partial_Field(matrix_size, &partial_field, &partial_field_clipboard);

	Init_Neighbor_Comm(cart_comm, sync_requests, matrix_size, &neighbor_dim0_left, &neighbor_dim0_right, &neighbor_dim1_left, &neighbor_dim1_right, &dim1_own_edge_values, &dim1_neighbor_egde_values);

	Init_Jacobi(dim0_size, dim1_size, alpha, &delta_t, &hx, &hy, &hx_square, &hy_square);

	Init_Edges(dim0_size, dim1_size, matrix_size, neighbor_dim0_left, neighbor_dim0_right, neighbor_dim1_left, neighbor_dim1_right, partial_field, partial_field_clipboard, pi);

	int *completions = malloc(sizeof(int) * cart_cluster_size);

	/*
	*
	* START ITERATION
	*
	*/
	if(rank == 0) {
		time_iterate = MPI_Wtime();
	}

	while (1) {			// iterate until break;
		iterations++;

		int completion = Jacobi_Iterate(neighbor_dim0_left, neighbor_dim0_right, neighbor_dim1_left, neighbor_dim1_right, alpha, delta_t, eps, hx_square, hy_square, pi, &partial_field, &partial_field_clipboard);

		/*
		if(iterations == 2) {
			MPI_Finalize();
			return 0;
		}
		*/

		int all_completed = Sync(cart_comm, completion, completions, cart_cluster_size, matrix_size, sync_requests, neighbor_dim0_left, neighbor_dim0_right, neighbor_dim1_left, neighbor_dim1_right, partial_field, dim1_own_edge_values, dim1_neighbor_egde_values);

		if(all_completed) {
			printf("rank: %d: break after %d iterations\n", rank, iterations);
			break;
		}
	}
	if(rank == 0) {
		time_end = MPI_Wtime();
	}

	/*
	*
	* END ITERATION
	*
	*/

	if(rank == 0) {
		printf("init: %.10lf\n", time_iterate- time_start);
		printf("iterate %.10lf\n", time_end - time_iterate);
		printf("total: %.10lf\n", time_end - time_start);
	}

	Send_To_Root(cart_comm, rank, dim0_size, dim1_size, cart_cluster_size, matrix_size, infos, partial_field, root_field);

	MPI_Finalize();

	return 0;
}