/* 
   Compute the integral from 0 to 1 of 4/(1+x*x).
   This should be equal to PI. 


   Parallelize the for loop by assigning each thread a subset of the
   iterations, round robin: thread 0 gets iterations 0, NUM_THREADS,
   2*NUM_THREADS, ... and so on.
   
   To avoid a race condition on the sum, use synchronization.

   Scalability: Terrible. Note that most of the computation is serialized!
   
*/ 


#include <stdio.h> 
#include <stdlib.h> 
#include <omp.h>

/* Number of threads requested from OpenMP (passed to omp_set_num_threads
   in main before the parallel region is entered). */
#define NUM_THREADS 8

/*
 * Approximate PI as the integral of 4/(1+x*x) over [0,1], using the
 * midpoint rule with num_steps rectangles of width step.
 *
 * The iterations are dealt out round robin to the threads of a single
 * parallel region: thread `id` handles iterations id, id+num_threads,
 * id+2*num_threads, ...  Every term is added directly to the shared
 * total_sum inside a critical section, i.e. there is one synchronized
 * update per iteration.
 *
 * Prints a Start/Done line per thread, then the computed value, the size
 * of the thread team and the elapsed wall-clock time in milliseconds.
 * The command-line arguments are not used.  Always returns 0.
 */
int main(int argc, char** argv) {

  /* the command line is not interpreted */
  (void)argc;
  (void)argv;

  long num_steps = 100;
  double step, pi;
  double t1, t2;
  int global_num_threads = 0;

  /* shared accumulator; every update must be synchronized */
  double total_sum = 0;

  step = 1/(double)num_steps;

  t1 = omp_get_wtime();

  omp_set_num_threads(NUM_THREADS);
#pragma omp parallel
  {
    /* variables declared inside the region are private to each thread */

    //this thread's id
    int id = omp_get_thread_num();

    //number of threads actually in the team
    int num_threads = omp_get_num_threads();

    //only thread 0 records the team size; it is read after the region
    if (id == 0) {
      global_num_threads = num_threads;
    }

    printf("Start(%d)\n", id);

    //round robin scheduling: thread id gets iterations id,
    //id+num_threads, id+2*num_threads, ...
    //The index is a long, like num_steps, so that the comparison and the
    //increment cannot overflow a narrower type when num_steps is raised.
    for (long i = id; i < num_steps; i += num_threads) {
      //value in the middle of the rectangle
      double x = (i+.5)*step;

      //one thread at a time may update the shared sum
#pragma omp critical
      total_sum += 4/(1+x*x);
    }
    printf("Done(%d)\n", id);
  }

  //sum of rectangle heights times the common width
  pi = step * total_sum;

  t2 = omp_get_wtime();
  //omp_get_wtime() is in seconds; report milliseconds
  printf("Done. PI = %.3f, THREADS=%d,  time = %.2f milliseconds\n",
	 pi, global_num_threads,  (t2-t1)*1000);

  return 0;
}