#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>
#include "my_lib.h"

void processor_grid_init(pcom_data *bsd);
void gridsetup();
void processor_grid_finalize(pcom_data *bsd);
void simple_pdsyev_call(int n, int nb, pcom_data *bsd);



void processor_grid_init(pcom_data *bsd){
/* 
 *      Initialize BLACS, get processor ID (mypnum) 
 *      and the number of processors (nprocs) 
 *      NOTE: mypnum = 0, 1, .... nprocs - 1 
 */
    Cblacs_pinfo(&bsd->mypnum, &bsd->nprocs); 
    
/*
 *      find out nprow, npcol so that: 
 *      nprow * npcol = nprocs
 */ 
    gridsetup(bsd->nprocs, &bsd->nprow, &bsd->npcol); 

/*
 *      Get a default BLACS context 
 */ 
    Cblacs_get(-1, 0, &bsd->icontxt); 
 
/*
 *      Initialize a default BLACS context 
 */   
    Cblacs_gridinit(&bsd->icontxt, "R", bsd->nprow, bsd->npcol); 
    
/*
 *      Get processor coordinates = myprow, mypcol 
 */ 
    Cblacs_gridinfo(bsd->icontxt, &bsd->nprow, &bsd->npcol, &bsd->myprow, &bsd->mypcol);
 

    printf("\n\tmy id =  %d, my row =  %d, my col = %d", bsd->mypnum, bsd->myprow, bsd->mypcol);
}

void gridsetup(int nprocs, int *nprow, int *npcol){ 
/* 
 * 
 *      This subroutine factorizes the number of processors (NPROCS) 
 *      into NPROW and NPCOL,  that are the sizes of the 2D processors mesh. 
 * 
 *      Written by Carlo Cavazzoni 
 */ 
    int sqrtnp,i; 
    
    sqrtnp = (int) sqrt(nprocs)+1; 
    for(i=1;i<=sqrtnp;++i){ 
        if((nprocs%i)==0) *nprow = i; 
    }
    *npcol = nprocs/(*nprow); 
} 

		     
void processor_grid_finalize(pcom_data *bsd){
/* 
 *      Close-up Blacs context 
 */ 
    Cblacs_gridexit(bsd->icontxt); 
}


void sample_pdsyev_call(int n, int nb, pcom_data *bsd){

    int k0=0, k1=1, k6=6, dim=50;

    double t1, t2;
    double alpha=1.0, beta=0.0, sup=1.0;
    int lwork, lda;
    int i, j, ii, jj, info, ml, nl;
    int *desca, *descb, *descz;
    double *a, *w, *z, *b, *btmp, *work;
    double *workp, *rtmp;
    double wsize, aval, somma=0.0;
    char *ka="A", *kb="B", *kz="Z", kv='V', ku='U', kn='N', kt='T'; 

    Cblacs_barrier(bsd->icontxt, "A");
    if(bsd->mypnum == 0){
	printf("\n\tMatrix %dx%d and Block size %d", n, n, nb);
    }

/*
 *      Calculate the LOCAL dimensions of A
 */
    lda = numroc_(&n, &nb, &bsd->myprow, &k0, &bsd->nprow);
    ml = numroc_(&n, &nb, &bsd->mypcol, &k0, &bsd->npcol);

    a = (double *) malloc(lda*ml*sizeof(double));
    w = (double *) malloc(n*sizeof(double));
    z = (double *) malloc(lda*ml*sizeof(double));
    desca = (int *) malloc(dim*sizeof(int));
    descb = (int *) malloc(dim*sizeof(int));
    descz = (int *) malloc(dim*sizeof(int));
    workp = (double *) malloc(n*sizeof(double));
    rtmp = (double *) malloc(bsd->nprocs*sizeof(double));

/*
 *      These are basic array descriptors
 */
    descinit_(desca, &n, &n, &nb, &nb, &k0, &k0, &bsd->icontxt, &lda, &info); 
    descinit_(descb, &n, &n, &nb, &nb, &k0, &k0, &bsd->icontxt, &lda, &info);     
    descinit_(descz, &n, &n, &nb, &nb, &k0, &k0, &bsd->icontxt, &lda, &info); 
/* 
 *      Build a matrix that you can create with 
 *      a one line matlab command:  hilb(n) + diag([1:-1/n:1/n]) 
 */
    for(j=1;j<=n;++j)
	for(i=1;i<=n;++i){
	    if(i == j){   
		aval = ((double)(n-i+1))/((double)( n ))+(1.0/((double)(i+j)-1.0)); 
            }
	    else{
		aval = 1.0/(((double)(i+j ))-1.0);
            }
            pdelset_(a, &i, &j, desca, &aval); 
	}
    
    if(n<=16){
	/*
	 * print matrix on standard output
	 */
	Cpdlaprnt(&n, &n, a, &k1, &k1, desca, &k0, &k0, "A", &k6, workp); 
    }
    
    Cblacs_barrier(bsd->icontxt, "A");
    
    t1 = cclock_();
    /* 
     *    diagonalization driver subroutine
     *
     *    the first call is to compute the work size (lwork = -1)
     */
    lwork = -1;
    pdsyev_(&kv, &ku, &n, a, &k1, &k1, desca, w, z, &k1, &k1, descz, &wsize, &lwork, &info);    
    lwork = (int)(wsize)+1;
    work = (double *) malloc(lwork*sizeof(double)); 
    pdsyev_(&kv, &ku, &n, a, &k1, &k1, desca, w, z, &k1, &k1, descz, work, &lwork, &info);
    free(work);
    
    if(info!=0){
	printf("\n\t*** DIAGONALIZATION FAILED ! ***\n");
    }

    t2 = cclock_(); 
    for(i=0;i<bsd->nprocs;++i) rtmp[i] = 0.0; 
    rtmp[bsd->mypnum] = t2-t1; 
    if(bsd->mypnum == 0){ 
	printf("\n\tSeconds for PDSYEV = %g\n\n", rtmp[bsd->mypnum]); 
    }

    if(n<=16){
	/*
	 * print matrix on standard output
	 */
	 Cpdlaprnt(&n, &n, z, &k1, &k1, descz, &k0, &k0, "Z", &k6, workp); 
    }
	 
    /* 
     * Inserire nel programma scal_esercizio.f90 codice necessario a verificare  
     * l'ortogonalita' degli autovettori memorizzati nella  
     * matrice z. 
     * 
     * traccia: 
     * 1) allocare una matrice B opportunamente dimensionata 
     * 2) definire il descrittore per B 
     * 3) utilizzare la moltiplicazione di matrici parallela  
     *    SCALAPACK PDGEMM per calcolare B = Z^t Z 
     * 4) i processori verificano che il proprio sottoblocco di 
     *    B corrisponde a quello della matrice identita',  
     *    utilizzando le funzioni SCALAPACK NUMROC e INDXL2G 
     * 5) utilizzando cclock monitorare il tempo di esecuzione 
     *    di PDGEMM 
     *     
     *     Print out diagonalization time and eigenvalues  
     */ 

    b = (double *) malloc(lda*ml*sizeof(double));

    Cblacs_barrier(bsd->icontxt, "A");
    t1 = cclock_();

    pdgemm_(&kt, &kn, &n, &n, &n, &alpha, z, &k1, &k1, descz, z, &k1, &k1, descz, &beta, b, &k1, &k1, desca);

    t2 = cclock_(); 
    for(i=0;i<bsd->nprocs;++i) rtmp[i] = 0.0; 
    rtmp[bsd->mypnum] = t2-t1;
    if(bsd->mypnum == 0){ 
        for(i=0;i<3;i++) sup *= (double) (n); 
	printf("\n\tGFlop/sec for PDGEMM = %g\n\n", sup/rtmp[bsd->mypnum]*1.0e-9);
    }
    
    Cblacs_barrier(bsd->icontxt, "A");
    nl = numroc_(&n, &nb, &bsd->myprow, &k0, &bsd->nprow);
    ml = numroc_(&n, &nb, &bsd->mypcol, &k0, &bsd->npcol);

    for(j=1;j<=ml;j++)
	for(i=1;i<=nl;++i){
	    ii = indxl2g_(&i, &nb, &bsd->myprow, &k0, &bsd->nprow);
	    jj = indxl2g_(&j, &nb, &bsd->mypcol, &k0, &bsd->npcol);
	    if(ii==jj){
		if(fabs(b[(nl*(j-1))+(i-1)]-1.0) >= (1.0e-8) ) printf("\neigenvector not orthogonal\n");
	    }else{
		if(fabs(b[(nl*(j-1))+(i-1)]) >= (1.0e-8) ) printf("\neigenvector not orthogonal\n");
	    }
	}

    if(n<=16){
	Cblacs_barrier(bsd->icontxt, "A");
	Cpdlaprnt(&n, &n, b, &k1, &k1, desca, &k0, &k0, "B", &k6, workp); 
    }
    else{
	for(i=0;i<lda*ml;++i)somma+=b[i];
        printf("\n\tSum of B on pe %d = %g", bsd->mypnum, somma);
    } 
    /* 
     *     Print out diagonalization time and eigenvalues  
     */     
    Cblacs_barrier(bsd->icontxt, "A");
    if(bsd->mypnum==0){
	if(n>16) n = 16;
	for(i=0;i<n;++i)
	    printf("\n\tw[%d] = %g", i, w[i]);
    }
    
    free(a);
    free(w);
    free(z);    
}    




int main(int argc, char *argv[]){

    int n, nb; 
    
    pcom_data bsd;

    MPI_Init(&argc,&argv);

    processor_grid_init(&bsd); 

    n = 2000;  /* matrix size */ 
    nb = 120; /* block size */ 
    
    sample_pdsyev_call(n, nb, &bsd);

    processor_grid_finalize(&bsd); 

    return 1;

}
    
