/* 
   Compute the integral from 0 to 1 of 4/(1+x*x). 
   This should be equal to PI. 


   Parallelize the for loop by assigning each thread a subset of the
   iterations. Use round robin: thread 0 gets iterations 0,
   NUM_THREADS, 2*NUM_THREADS, ... and so on. 
   
   To avoid a race condition on the sum, use an array of sums, one for
   each thread.

   Scalability is poor (presumably because the per-thread sums sit in
   adjacent array elements -- verify by measuring). 
   
*/ 


#include <stdio.h> 
#include <stdlib.h> 
#include <omp.h>

#define NUM_THREADS 2

/*
 * Approximate PI as the midpoint-rule integral of 4/(1+x*x) over [0,1].
 *
 * The iterations are split round-robin across the OpenMP team: the thread
 * with id k handles iterations k, k+T, k+2T, ... where T is the team size.
 * Each thread accumulates into its own slot of sum[], so no two threads
 * write the same element; the slots are added up serially afterwards.
 *
 * Prints a start/done line per thread, then the result, the number of
 * threads actually used and the elapsed wall-clock time in milliseconds.
 *
 * argc and argv are unused. Always returns 0.
 */
int main(int argc, char** argv) {

  long num_steps = 10000000; 
  double step, pi; 
  double t1, t2; 
  int global_num_threads = 0; 
  /* same type as num_steps so the loop counter cannot overflow before
     reaching the bound */
  long i; 

  /* one partial sum per thread; total_sum must start at zero because it
     is only ever accumulated into below */
  double total_sum = 0.0, sum[NUM_THREADS]; 

  (void)argc; 
  (void)argv; 

  step = 1.0/(double)num_steps; 
  
  t1 = omp_get_wtime(); 
  
  omp_set_num_threads(NUM_THREADS); 
#pragma omp parallel  private(i)
  { 
    /* declared inside the parallel region, hence private to each thread */
    double x; 

    /* this thread's id within the team */
    int id = omp_get_thread_num(); 

    /* size of the team actually running this region */
    int num_threads = omp_get_num_threads(); 

    /* thread 0 publishes the team size for the serial reduction below */
    if (id == 0) 
      global_num_threads = num_threads; 

    /* each thread clears its own slot */
    sum[id] = 0; 

    printf("Start(%d)\n", id); 
    
    /* round robin: this thread takes iterations id, id+num_threads,
       id+2*num_threads, ... */
    for (i = id; i < num_steps; i += num_threads) {
      /* evaluate the integrand at the midpoint of rectangle i */
      x = (i+.5)*step; 
      sum[id] = sum[id] + 4/(1+x*x); 
    }
    printf("Done(%d)\n", id); 
  } /* end of parallel region */
  
  /* serial reduction of the per-thread partial sums */
  for (i = 0; i < global_num_threads; i++) 
    total_sum += sum[i]; 

  pi = step * total_sum; 

  t2 = omp_get_wtime();     
  printf("Done. PI = %.3f, THREADS=%d,  time = %.2f milliseconds\n", 
         pi, global_num_threads,  (t2-t1)*1000);
  
  return 0; 
}